From 11b48333b46ecd464cc3979de66038c87717e8d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Fourrier?= <22726840+clefourrier@users.noreply.github.com> Date: Mon, 22 Apr 2024 18:25:56 +0200 Subject: [PATCH 01/25] no function we actually use should be named prompt_fn (#168) --- src/lighteval/tasks/extended/mt_bench/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lighteval/tasks/extended/mt_bench/main.py b/src/lighteval/tasks/extended/mt_bench/main.py index 8da44e25..ec8347b7 100644 --- a/src/lighteval/tasks/extended/mt_bench/main.py +++ b/src/lighteval/tasks/extended/mt_bench/main.py @@ -37,7 +37,7 @@ task = LightevalTaskConfig( name="mt_bench", - prompt_function="prompt_fn", # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py + prompt_function="mt_bench_prompt", # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py suite=["extended"], hf_repo="lighteval/mt-bench", hf_subset="default", @@ -51,7 +51,7 @@ ) -def prompt_fn(line, task_name: str = None): +def mt_bench_prompt(line, task_name: str = None): """Defines how to go from a dataset line to a doc object. Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info about what this function should do in the README. From af35e8868bba6ce99c0789bb1c5f22501b8b00d2 Mon Sep 17 00:00:00 2001 From: Jan Philipp Harries <2862336+jphme@users.noreply.github.com> Date: Mon, 29 Apr 2024 11:40:36 +0200 Subject: [PATCH 02/25] Fix prompt format german rag community task (#171) Add "Answer: " at the end of the prompt --- community_tasks/german_rag_evals.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/community_tasks/german_rag_evals.py b/community_tasks/german_rag_evals.py index b308314d..55b35440 100644 --- a/community_tasks/german_rag_evals.py +++ b/community_tasks/german_rag_evals.py @@ -110,7 +110,9 @@ def prompt_fn_choose_question_by_context(line, task_name: str = None): A: {choice_a} B: {choice_b} C: {choice_c} -D: {choice_d}""" +D: {choice_d} + +Antwort:""" query = instruction + query_template.format( context=line["context"], choice_a=line["choice_a"], @@ -147,7 +149,9 @@ def prompt_fn_choose_context_by_question(line, task_name: str = None): {choice_c} D: -{choice_d}""" +{choice_d} + +Antwort:""" query = instruction + query_template.format( question=line["question"], choice_a=line["choice_a"], @@ -170,7 +174,9 @@ def prompt_fn_question_answer_match(line, task_name: str = None): query_template = """\ Die Frage: {question} -Die Antwort: {answer}""" +Die Antwort: {answer} + +Auswahl (J/N):""" query = instruction + query_template.format( question=line["question"], answer=line["answer"], @@ -191,7 +197,9 @@ def prompt_fn_context_question_match(line, task_name: str = None): Kontext: {context} -Die Frage: {question}""" +Die Frage: {question} + +Auswahl (J/N):""" query = instruction + query_template.format( question=line["question"], context=line["context"], From 980609328327bc74576a356a80afffb92afbf618 Mon Sep 17 00:00:00 2001 From: Nathan Habib <30601243+NathanHB@users.noreply.github.com> Date: Mon, 29 Apr 2024 14:02:19 +0200 Subject: [PATCH 03/25] add 'cite as' section in readme (#178) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --------- Co-authored-by: Nathan Habib Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git 
a/README.md b/README.md index e7c54928..6e5a6e79 100644 --- a/README.md +++ b/README.md @@ -443,3 +443,15 @@ srun accelerate launch --multi_gpu --num_processes=8 run_evals_accelerate.py --m pip install build python3 -m build . ``` + +## Cite as + +```bibtex +@misc{lighteval, + author = {Fourrier, Clémentine and Habib, Nathan and Wolf, Thomas and Tunstall, Lewis}, + title = {LightEval: A lightweight framework for LLM evaluation}, + year = {2023}, + version = {0.3.0}, + url = {https://github.com/huggingface/lighteval} +``` + From 0a455c4fd94742495887a6e471ad986ad23bf8bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Fourrier?= <22726840+clefourrier@users.noreply.github.com> Date: Tue, 30 Apr 2024 12:06:18 +0200 Subject: [PATCH 04/25] Add maj@k metric (#158) Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> * added review change --------- Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> --- README.md | 3 + src/lighteval/data.py | 3 +- src/lighteval/evaluator.py | 6 +- src/lighteval/logging/info_loggers.py | 5 +- src/lighteval/metrics/__init__.py | 53 ++++++++------- src/lighteval/metrics/metrics.py | 37 +++++++++++ src/lighteval/metrics/metrics_sample.py | 86 +++++++++++++++++++++++++ src/lighteval/metrics/utils.py | 3 +- src/lighteval/models/abstract_model.py | 28 -------- src/lighteval/models/base_model.py | 80 +++++++++++------------ src/lighteval/models/endpoint_model.py | 38 +++-------- src/lighteval/models/nanotron_model.py | 24 ++----- src/lighteval/tasks/lighteval_task.py | 68 ++++++++++--------- src/lighteval/tasks/requests.py | 21 +----- src/lighteval/tasks/tasks_table.jsonl | 16 ++--- tests/test_unit_harness_metrics.py | 12 ++-- 16 files changed, 270 insertions(+), 213 deletions(-) diff --git a/README.md b/README.md index 6e5a6e79..71c66790 100644 --- a/README.md +++ b/README.md @@ -350,6 +350,7 @@ These metrics need the model to generate an output. They are therefore slower. - `f1_score`: Average F1 score in terms of word overlap between the model output and gold without normalisation - `f1_score_macro`: Corpus level macro F1 score - `f1_score_macro`: Corpus level micro F1 score + - `maj_at_5` and `maj_at_8`: Model majority vote. Takes n (5 or 8) generations from the model and assumes the most frequent is the actual prediction. - Summarization: - `rouge` (Harness): Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) - `rouge1` (HELM): Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap. @@ -376,7 +377,9 @@ These metrics need the model to generate an output. They are therefore slower. - `edit_similarity`: average Levenshtein edit similarity (normalized by length of longer sequence) between model generation and reference. 
- Math: - `quasi_exact_match_math` (HELM): Fraction of instances where the normalized prediction matches the normalized gold (normalization done for math, where latex symbols, units, etc are removed) + - `maj_at_4_math` (Lighteval): Majority choice evaluation, using the math normalisation for the predictions and gold - `quasi_exact_match_gsm8k` (Harness): Fraction of instances where the normalized prediction matches the normalized gold (normalization done for gsm8k, where latex symbols, units, etc are removed) + - `maj_at_8_gsm8k` (Lighteval): Majority choice evaluation, using the gsm8k normalisation for the predictions and gold ### Metrics for specific tasks To keep compatibility with the Harness for some specific tasks, we ported their evaluations more or less as such. They include `drop` (for the DROP dataset) and `truthfulqa_mc_metrics` (for TruthfulQA). In general, except for tasks where the dataset has a very different formatting than usual (an other language, programming language, math, ...), we want to use standard implementations of the above metrics. It makes little sense to have 10 different versions of an exact match depending on the task. However, most of the above metrics are parametrizable so that you can change the normalization applied easily for experimental purposes. diff --git a/src/lighteval/data.py b/src/lighteval/data.py index 711b0749..247cff04 100644 --- a/src/lighteval/data.py +++ b/src/lighteval/data.py @@ -29,7 +29,6 @@ from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.tasks.requests import ( GreedyUntilRequest, - GreedyUntilWithLogitsRequest, LoglikelihoodRequest, LoglikelihoodRollingRequest, LoglikelihoodSingleTokenRequest, @@ -205,7 +204,7 @@ def _sorting_criteria(self, request: LoglikelihoodSingleTokenRequest) -> int: class GenerativeTaskDataset(DynamicBatchDataset): - def _sorting_criteria(self, request: GreedyUntilRequest | GreedyUntilWithLogitsRequest) -> int: + def _sorting_criteria(self, request: GreedyUntilRequest) -> int: """ Collate function for generating batches. diff --git a/src/lighteval/evaluator.py b/src/lighteval/evaluator.py index c77c3889..e837b922 100644 --- a/src/lighteval/evaluator.py +++ b/src/lighteval/evaluator.py @@ -82,12 +82,10 @@ def evaluate( # noqa: C901 full_resps = lm.loglikelihood(requests, override_bs=override_bs) elif request_type == RequestType.LOGLIKELIHOOD_SINGLE_TOKEN: full_resps = lm.loglikelihood_single_token(requests, override_bs=override_bs) - elif request_type == RequestType.GREEDY_UNTIL: - full_resps = lm.greedy_until(requests, override_bs=override_bs) - elif request_type == RequestType.GREEDY_UNTIL_WITH_LOGITS: - full_resps = lm.greedy_until_with_logits(requests, override_bs=override_bs) elif request_type == RequestType.LOGLIKELIHOOD_ROLLING: full_resps = lm.loglikelihood_rolling(requests, override_bs=override_bs) + elif request_type == RequestType.GREEDY_UNTIL: + full_resps = lm.greedy_until(requests, override_bs=override_bs) elif request_type == RequestType.GREEDY_UNTIL_MULTI_TURN: full_resps = lm.greedy_until_multi_turn(requests, override_bs=override_bs) else: diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index 83fe981e..c211d2e4 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -350,7 +350,10 @@ def log( ): pred_saved = True pass # should we log something? 
- if task.has_metric_category[MetricCategory.GENERATIVE]: + if ( + task.has_metric_category[MetricCategory.GENERATIVE] + or task.has_metric_category[MetricCategory.GENERATIVE_SAMPLING] + ): detail.gold = doc.get_golds() pred_saved = True if task.has_metric_category[MetricCategory.GENERATIVE_LOGPROB]: diff --git a/src/lighteval/metrics/__init__.py b/src/lighteval/metrics/__init__.py index 7ef77aef..3d525756 100644 --- a/src/lighteval/metrics/__init__.py +++ b/src/lighteval/metrics/__init__.py @@ -66,16 +66,21 @@ def apply_perplexity_metric(results: list[ModelReturn], formatted_doc: Doc, metr return results, outputs -def apply_generative_metric(results: list[ModelReturn], formatted_doc: Doc, metrics: list[str], output_regex=None): +def apply_generative_metric( + results: list[ModelReturn], formatted_doc: Doc, metrics: list[str], output_regex=None, max_num_samples=1 +): outputs = {} # Post processing prediction - pred_raw = results.pop(0).result - if output_regex is not None: - pred = next(iter(re.findall(output_regex, pred_raw)), "") - else: - pred = pred_raw - pred = as_list(pred) + preds_raw = as_list(results.pop(0).result) + preds = [] + + for pred_raw in preds_raw: + if output_regex is not None: + pred = next(iter(re.findall(output_regex, pred_raw)), "") + else: + pred = pred_raw + preds.append(pred) # Extracting gold try: @@ -87,23 +92,28 @@ def apply_generative_metric(results: list[ModelReturn], formatted_doc: Doc, metr # if "label_to_choices" in formatted_doc: if formatted_doc.specific is not None and "label_to_choices" in formatted_doc.specific: # Helm predicts on labels keys (A/B/C/D), but computes metrics on choices - pred = [formatted_doc.specific["label_to_choices"].get(p) for p in pred] + preds = [formatted_doc.specific["label_to_choices"].get(p) for p in preds] golds = [formatted_doc.specific["label_to_choices"][g] for g in golds] for metric in metrics: if Metrics[metric].value.category == MetricCategory.GENERATIVE: - outputs.update(Metrics[metric].value.compute(golds=golds, predictions=pred, formatted_doc=formatted_doc)) - - return results, outputs - - -def apply_generative_logprob_metric(results: list[ModelReturn], formatted_doc: Doc, metrics: list[str]): - # Applied to no metric atm, but we have the model side logic - outputs = {} - - for metric in metrics: + outputs.update( + Metrics[metric].value.compute( + golds=golds, + predictions=as_list(preds[0]) if max_num_samples > 0 else preds, + formatted_doc=formatted_doc, + ) + ) if Metrics[metric].value.category == MetricCategory.GENERATIVE_LOGPROB: - outputs.update(Metrics[metric].value.compute(results=results, formatted_doc=formatted_doc)) + outputs.update( + Metrics[metric].value.compute( + golds=golds, + predictions=as_list(preds[0]) if max_num_samples > 0 else preds, + formatted_doc=formatted_doc, + ) + ) + if Metrics[metric].value.category == MetricCategory.GENERATIVE_SAMPLING: + outputs.update(Metrics[metric].value.compute(golds=golds, predictions=preds, formatted_doc=formatted_doc)) return results, outputs @@ -153,10 +163,7 @@ def apply_llm_as_judge_metric(results: list[ModelReturn], formatted_doc: Doc, me predictions = results.pop(0).result for metric in metrics: - if ( - Metrics[metric].value.category == MetricCategory.LLM_AS_JUDGE_MULTI_TURN - or Metrics[metric].value.category == MetricCategory.LLM_AS_JUDGE - ): + if Metrics[metric].value.category in [MetricCategory.LLM_AS_JUDGE_MULTI_TURN, MetricCategory.LLM_AS_JUDGE]: outputs.update(Metrics[metric].value.compute(predictions=predictions, 
formatted_doc=formatted_doc)) return results, outputs diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index 4a0e367d..07d5c918 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -41,6 +41,7 @@ F1_score, JudgeLLM, LoglikelihoodAcc, + MajAtK, Recall, StringDistance, acc_golds_likelihood, @@ -326,6 +327,42 @@ class Metrics(Enum): corpus_level_fn=matthews_corrcoef, higher_is_better=True, ) + maj_at_4_math = SampleLevelMetric( + metric="maj@4", + sample_level_fn=MajAtK( + k=4, strip_strings=True, normalize_pred=math_normalizer, normalize_gold=math_normalizer_gold + ).compute, + category=MetricCategory.GENERATIVE_SAMPLING, + use_case=MetricUseCase.MATH, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + maj_at_5 = SampleLevelMetric( + metric="maj@5", + sample_level_fn=MajAtK(k=5).compute, + category=MetricCategory.GENERATIVE_SAMPLING, + use_case=MetricUseCase.ACCURACY, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + maj_at_8 = SampleLevelMetric( + metric="maj@8", + sample_level_fn=MajAtK(k=8).compute, + category=MetricCategory.GENERATIVE_SAMPLING, + use_case=MetricUseCase.ACCURACY, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + maj_at_8_gsm8k = SampleLevelMetric( + metric="maj@8", + sample_level_fn=MajAtK( + k=8, strip_strings=True, normalize_pred=gsm8k_normalizer, normalize_gold=gsm8k_normalizer + ).compute, + category=MetricCategory.GENERATIVE_SAMPLING, + use_case=MetricUseCase.MATH, + corpus_level_fn=np.mean, + higher_is_better=True, + ) mrr = SampleLevelMetric( metric="mrr", sample_level_fn=MRR().compute, diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index a3809adb..37b922da 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -675,3 +675,89 @@ def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[ "user_prompt": messages[0], "judgement": judgements[0], } + + +class MajAtK: + def __init__( + self, + k: int, + normalize_gold: callable = None, + normalize_pred: callable = None, + strip_strings: bool = False, + type_exact_match: str = "full", + ): + """An exact match class. + + Args: + normalize_gold (callable, optional): Function to use to normalize the reference strings. + Defaults to None if no normalization is applied. + normalize_pred (callable, optional): Function to use to normalize the predicted strings. + Defaults to None if no normalization is applied. + strip_strings (bool, optional): Whether to strip both reference and predictions. Defaults to False. + type_exact_match (str, optional): Defines what type of match to apply (post normalization if present). + Can be any of `prefix`, `suffix` or `full`. Defaults to "full". + `prefix` checks if the prediction starts with the gold, + `suffix` if the prediction ends with the gold, + `full` if the prediction and gold are equal + """ + self.k = k + self.normalize_gold = normalize_gold + self.normalize_pred = normalize_pred + self.strip_strings = strip_strings + + if type_exact_match not in ["prefix", "suffix", "full"]: + # todo: we could add a set exact match + raise ValueError( + f"type_exact_match (used in parametrized_exact_match) must be one of prefix, suffix, or full. Was {type_exact_match} instead." 
+ ) + self.type_exact_match = type_exact_match + + def compute(self, golds: list[str], predictions: list[str], **kwargs) -> dict[str, float]: + """Computes the metric over a list of golds and predictions for one single sample. + It applies normalisation (if needed) to model prediction and gold, and takes the most frequent answer of all the available ones, + then compares it to the gold. + + Args: + golds (list[str]): Reference targets + predictions (list[str]): k predicted strings + + Returns: + float: Aggregated score over the current sample's items. + """ + if len(golds) > 1: + raise Exception("Cannot compute maj@k with several golds") + + gold = self.get_processed_gold(golds[0]) + all_answers = [] + for pred in predictions[: self.k]: + all_answers.append(self.get_processed_pred(pred=pred)) + majority_prediction = max(all_answers, key=all_answers.count) + return self.compute_score(majority_prediction, gold) + + def get_processed_gold(self, gold: str) -> float: + if self.strip_strings: + gold = gold.strip() + + if self.normalize_gold: + gold = self.normalize_gold(gold) + + return gold + + def get_processed_pred(self, pred: str) -> float: + if not pred: + return "" + + if self.strip_strings: + pred = pred.strip() + + if self.normalize_pred: + pred = self.normalize_pred(pred) + + return pred + + def compute_score(self, pred: str, gold: str) -> int: + if self.type_exact_match == "prefix": + return 1 if pred.startswith(gold) else 0 + if self.type_exact_match == "suffix": + return 1 if pred.endswith(gold) else 0 + return 1 if gold == pred else 0 diff --git a/src/lighteval/metrics/utils.py b/src/lighteval/metrics/utils.py index 6c79871e..e5ceaeb0 100644 --- a/src/lighteval/metrics/utils.py +++ b/src/lighteval/metrics/utils.py @@ -28,9 +28,10 @@ class MetricCategory(Enum): TARGET_PERPLEXITY = auto() PERPLEXITY = auto() GENERATIVE = auto() + GENERATIVE_LOGPROB = auto() + GENERATIVE_SAMPLING = auto() LLM_AS_JUDGE_MULTI_TURN = auto() LLM_AS_JUDGE = auto() - GENERATIVE_LOGPROB = auto() MULTICHOICE = auto() MULTICHOICE_ONE_TOKEN = auto() IGNORED = auto() diff --git a/src/lighteval/models/abstract_model.py b/src/lighteval/models/abstract_model.py index ccc49146..754a6144 100644 --- a/src/lighteval/models/abstract_model.py +++ b/src/lighteval/models/abstract_model.py @@ -36,7 +36,6 @@ from lighteval.tasks.requests import ( GreedyUntilMultiTurnRequest, GreedyUntilRequest, - GreedyUntilWithLogitsRequest, LoglikelihoodRequest, LoglikelihoodRollingRequest, LoglikelihoodSingleTokenRequest, @@ -83,31 +82,6 @@ def max_length(self) -> int: def disable_tqdm(self) -> bool: raise NotImplementedError - def greedy_until_with_logits( - self, - requests: list[GreedyUntilWithLogitsRequest], - override_bs: Optional[int] = None, - ) -> list[GenerateReturn]: - """ - Generates sequences greedily until a stopping condition is met, - returning both the generated sequences and the logits. - - Args: - requests (list[tuple[str, dict]]): A list of input requests, - where each request is a tuple containing a prompt string and a dictionary of additional parameters. - disable_tqdm (bool, optional): Whether to disable the tqdm progress bar. Defaults to False. - override_bs (Optional[int], optional): Overrides the batch size for generation. Defaults to None. - - Returns: - list[GenerateReturn]: A list of GenerateReturn objects, - where each object contains the generated sequence and the corresponding logits. 
- """ - return self.greedy_until( - requests=requests, - override_bs=override_bs, - returns_logits=True, - ) - def greedy_until_multi_turn( # noqa: C901 self, requests: list[GreedyUntilMultiTurnRequest], override_bs: Optional[int] = None ) -> GenerateMultiTurnReturn: @@ -118,7 +92,6 @@ def greedy_until_multi_turn( # noqa: C901 def greedy_until( self, requests: list[GreedyUntilRequest], - returns_logits: bool = False, override_bs: Optional[int] = None, ) -> list[GenerateReturn]: """ @@ -126,7 +99,6 @@ def greedy_until( Args: requests (list[Request]): list of requests containing the context and ending conditions. - returns_logits (bool, optional): Whether to return the logits of the generated responses. Defaults to False. disable_tqdm (bool, optional): Whether to disable the progress bar. Defaults to False. override_bs (int, optional): Override the batch size for generation. Defaults to None. diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index 5dbaa750..7d9bd8d2 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -46,7 +46,6 @@ from lighteval.tasks.requests import ( GreedyUntilMultiTurnRequest, GreedyUntilRequest, - GreedyUntilWithLogitsRequest, LoglikelihoodRequest, LoglikelihoodRollingRequest, LoglikelihoodSingleTokenRequest, @@ -326,32 +325,6 @@ def forward_batch(batch_size): hlog(f"Determined largest batch size: {batch_size}") return batch_size - def greedy_until_with_logits( - self, - requests: list[GreedyUntilWithLogitsRequest], - override_bs: Optional[int] = None, - ) -> list[GenerateReturn]: - """ - Generates sequences greedily until a stopping condition is met, - returning both the generated sequences and the logits. - - Args: - requests (list[tuple[str, dict]]): A list of input requests, - where each request is a tuple containing a prompt string and a dictionary of additional parameters. - override_bs (Optional[int], optional): Overrides the batch size for generation. Defaults to None. - - Returns: - list[GenerateReturn]: A list of GenerateReturn objects, - where each object contains the generated sequence and the corresponding logits. - """ - - return self.greedy_until( - requests, - returns_logits=True, - disable_tqdm=self.disable_tqdm, - override_bs=override_bs, - ) - def greedy_until_multi_turn( # noqa: C901 self, requests: list[GreedyUntilMultiTurnRequest], override_bs: Optional[int] = None ) -> GenerateMultiTurnReturn: @@ -486,7 +459,6 @@ def greedy_until_multi_turn( # noqa: C901 def greedy_until( self, requests: list[GreedyUntilRequest], - returns_logits: bool = False, override_bs: Optional[int] = None, ) -> list[GenerateReturn]: """ @@ -494,7 +466,6 @@ def greedy_until( Args: requests (list[Request]): list of requests containing the context and ending conditions. - returns_logits (bool, optional): Whether to return the logits of the generated responses. Defaults to False. override_bs (int, optional): Override the batch size for generation. Defaults to None. Returns: @@ -542,10 +513,12 @@ def greedy_until( dataloader, desc="Greedy generation", position=1, leave=False, disable=self.disable_tqdm ): # NOTE: we are assuming all items in a batch behave similarly (same - # stop_tokens and max_tokens genrated) which is not necessarily + # stop_tokens and max_tokens generated) which is not necessarily # the case! 
Because of that we only use batch size of 1 stop_tokens = batch[0].stop_sequence max_new_tokens = batch[0].generation_size + returns_logits = batch[0].use_logits + num_samples = batch[0].num_samples # The main question for this step is the following: # Would we rather truncate the prompt to allow generation to go to max_new_tokens, at the risk @@ -596,6 +569,7 @@ def greedy_until( max_new_tokens=max_new_tokens, stop_tokens=stop_tokens, returns_logits=returns_logits, + num_samples=num_samples, ) results.extend(cur_reponses) @@ -607,11 +581,13 @@ def _generate( max_new_tokens: int, stop_tokens: list[str], returns_logits: Optional[bool] = False, + num_samples: Optional[int] = 1, ) -> list[GenerateReturn]: """Contains the actual logic of the generation. First computes the stop sequences, then generates the predictions, then converts the outputs to GenerateReturn. """ stopping_criteria = stop_sequences_criteria(self.tokenizer, stop_sequences=stop_tokens, batch=batch) + batch_size, _ = batch.input_ids.shape # Compute model generation outputs = self.model.generate( @@ -619,16 +595,18 @@ def _generate( attention_mask=batch.input_mask, max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria, - do_sample=False, pad_token_id=self.tokenizer.pad_token_id if self.tokenizer.pad_token_id else self.tokenizer.eos_token_id, return_dict_in_generate=True, output_scores=True, eos_token_id=self.tokenizer.eos_token_id, + do_sample=num_samples > 1, + num_return_sequences=num_samples, ) if returns_logits: logits = self.model.compute_transition_scores(outputs.sequences, outputs.scores, normalize_logits=True) generations = outputs.sequences[:, batch.input_ids.size(1) :] - generations, len_gens = self.pad_and_gather(generations) + generations = torch.reshape(generations, (batch_size, num_samples, -1)) + generations, len_gens = self.pad_and_gather(generations, num_samples=num_samples) batch.input_ids, len_ids = self.pad_and_gather(batch.input_ids) logits, len_logits = None, None @@ -646,20 +624,30 @@ def _generate( # We convert to GenerateReturn outputs all_responses = [] - for ix, (generation, batched_input, trunc, padded) in enumerate( + for ix, (batched_generations, batched_input, trunc, padded) in enumerate( zip(generations, batch.input_ids, batch.truncated, batch.padded) ): + result_generations = [] + decoded_generations = [] # Ensure the generated responses do not contain the stop sequences. 
- generation = generation[: len_gens[ix]] - decoded_generation = self.tok_decode([generation])[0] + for generation in batched_generations: + generation = generation[: len_gens[ix]] + result_generations.append(generation) + decoded_generation = self.tok_decode([generation])[0] - for term in stop_tokens: - decoded_generation = decoded_generation.split(term)[0] + for term in stop_tokens: + decoded_generation = decoded_generation.split(term)[0] + + decoded_generations.append(decoded_generation) + + if num_samples == 1: # We only return one item + result_generations = result_generations[0] + decoded_generations = decoded_generations[0] cur_response = GenerateReturn( - result=decoded_generation, + result=decoded_generations, logits=logits[ix][: len_logits[ix]] if returns_logits else None, - generated_tokens=generation, + generated_tokens=result_generations, input_tokens=batched_input[: len_ids[ix]], truncated_tokens_count=trunc.cpu().item(), padded_tokens_count=padded.cpu().item(), @@ -891,7 +879,9 @@ def prepare_batch_logprob( padded=padded, ) - def pad_and_gather(self, output_tensor: torch.Tensor, drop_last_samples: bool = True) -> torch.Tensor: + def pad_and_gather( + self, output_tensor: torch.Tensor, drop_last_samples: bool = True, num_samples: int = None + ) -> torch.Tensor: """ Pads the `output_tensor` to the maximum length and gathers the lengths across processes. @@ -905,15 +895,19 @@ def pad_and_gather(self, output_tensor: torch.Tensor, drop_last_samples: bool = torch.Tensor: The padded output tensor and the gathered length tensor. """ # Create a tensor of size batch_size, [output_length] * batch_size, for each process - length_tensor = torch.tensor([output_tensor.shape[1]] * output_tensor.shape[0], device=self.device) + # output_tensor can be of size: batch_size * num_samples * length_item or just batch_size * length_item + length_tensor = torch.tensor([output_tensor.shape[-1]] * output_tensor.shape[0], device=self.device) if self.accelerator is not None: # Gather all the lengths, we end up with a tensor of size num_processes [output_length_1, output_length_2, ...] 
length_tensor = self.accelerator.gather(length_tensor) # We pad the output_tensor to the max length max_length = length_tensor.max().item() - output_tensor = F.pad( - output_tensor, (0, max_length - output_tensor.shape[1], 0, 0), value=self.tokenizer.pad_token_id + padding = ( + (0, max_length - output_tensor.shape[-1], 0, 0, 0, 0) + if num_samples is not None + else (0, max_length - output_tensor.shape[-1], 0, 0) ) + output_tensor = F.pad(output_tensor, padding, value=self.tokenizer.pad_token_id) if self.accelerator: if drop_last_samples: output_tensor = self.accelerator.gather_for_metrics(output_tensor) diff --git a/src/lighteval/models/endpoint_model.py b/src/lighteval/models/endpoint_model.py index b118a93b..d79e0f91 100644 --- a/src/lighteval/models/endpoint_model.py +++ b/src/lighteval/models/endpoint_model.py @@ -44,7 +44,6 @@ from lighteval.models.model_output import GenerateReturn, LoglikelihoodReturn, LoglikelihoodSingleTokenReturn from lighteval.tasks.requests import ( GreedyUntilRequest, - GreedyUntilWithLogitsRequest, LoglikelihoodRequest, LoglikelihoodRollingRequest, LoglikelihoodSingleTokenRequest, @@ -182,7 +181,7 @@ def __process_request(self, context: str, stop_tokens: list[str], max_tokens: in async def __async_process_batch_generate( self, - requests: list[GreedyUntilRequest | GreedyUntilWithLogitsRequest], + requests: list[GreedyUntilRequest], ) -> list[TextGenerationOutput]: return await asyncio.gather( *[ @@ -197,7 +196,7 @@ async def __async_process_batch_generate( def __process_batch_generate( self, - requests: list[GreedyUntilRequest | GreedyUntilWithLogitsRequest], + requests: list[GreedyUntilRequest], ) -> list[TextGenerationOutput]: return [ self.__process_request( @@ -234,35 +233,9 @@ def __process_batch_logprob( for request in requests ] - def greedy_until_with_logits( - self, - requests: list[GreedyUntilWithLogitsRequest], - override_bs: Optional[int] = None, - ) -> list[GenerateReturn]: - """ - Generates sequences greedily until a stopping condition is met, - returning both the generated sequences and the logits. - - Args: - requests (list[tuple[str, dict]]): A list of input requests, - where each request is a tuple containing a prompt string and a dictionary of additional parameters. - override_bs (Optional[int], optional): Overrides the batch size for generation. Defaults to None. - - Returns: - list[GenerateReturn]: A list of GenerateReturn objects, - where each object contains the generated sequence and the corresponding logits. - """ - - return self.greedy_until( - requests, - returns_logits=True, - override_bs=override_bs, - ) - def greedy_until( self, requests: List[GreedyUntilRequest], - returns_logits: bool = False, override_bs: Optional[int] = None, ) -> List[GenerateReturn]: for request in requests: @@ -286,6 +259,13 @@ def greedy_until( dataloader, desc="Greedy generation", position=1, leave=False, disable=self.disable_tqdm ): # the `returns_logits` flag is only used to filter the results, we always request the full details. 
+ returns_logits = batch[0].use_logits + num_samples = batch[0].num_samples + if num_samples > 1: + hlog_err( + "Inference endpoints does not allow sampling evaluations - this is likely to fail or provide problematic results" + ) + if self.use_async: responses = asyncio.run(self.__async_process_batch_generate(batch)) else: diff --git a/src/lighteval/models/nanotron_model.py b/src/lighteval/models/nanotron_model.py index 69ad420f..977b2b19 100644 --- a/src/lighteval/models/nanotron_model.py +++ b/src/lighteval/models/nanotron_model.py @@ -54,7 +54,7 @@ LoglikelihoodDataset, LoglikelihoodSingleTokenDataset, ) -from lighteval.logging.hierarchical_logger import hlog_warn +from lighteval.logging.hierarchical_logger import hlog_err, hlog_warn from lighteval.models.base_model import LightevalModel from lighteval.models.model_config import EnvConfig from lighteval.models.model_output import Batch, GenerateReturn, LoglikelihoodReturn, LoglikelihoodSingleTokenReturn @@ -351,21 +351,6 @@ def tok_decode(self, tokens: torch.LongTensor) -> List[str]: def _model_call(self, inputs: torch.Tensor) -> torch.Tensor: return self.model(inputs) - def greedy_until_with_logits( - self, - requests: list[tuple[str, dict]], - disable_tqdm: bool = False, - override_bs=None, - dataset_splits: int = 4, - ) -> list[GenerateReturn]: - return self.greedy_until( - requests, - returns_logits=True, - disable_tqdm=disable_tqdm, - override_bs=override_bs, - dataset_splits=dataset_splits, - ) - def _encode_pair(self, context, continuation): n_spaces = len(context) - len(context.rstrip()) if n_spaces > 0: @@ -1130,7 +1115,6 @@ def _loglikelihood_tokens( def greedy_until( self, requests: List[GreedyUntilRequest], - returns_logits=False, disable_tqdm: bool = False, override_bs=None, dataset_splits: int = 1, @@ -1216,6 +1200,12 @@ def greedy_until( # the maximum allowed generation size for the batch, unless we want to force truncation # need to pass them somewhere ! 
stop_tokens = batch[0].stop_sequence max_new_tokens = batch[0].generation_size + returns_logits = batch[0].use_logits + num_samples = batch[0].num_samples + if num_samples > 1: + hlog_err( + "Nanotron models do not allow sampling evaluations - this is likely to fail or provide problematic results" + ) # The main question for this step is the following: # Would we rather truncate the prompt to allow generation to go to max_new_tokens, at the risk diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index 64ba9f39..f9df6fdd 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -32,7 +32,6 @@ from lighteval.few_shot_manager import FewShotSampler from lighteval.logging.hierarchical_logger import hlog, hlog_warn from lighteval.metrics import ( - apply_generative_logprob_metric, apply_generative_metric, apply_llm_as_judge_metric, apply_multichoice_metric, @@ -47,7 +46,6 @@ Doc, GreedyUntilMultiTurnRequest, GreedyUntilRequest, - GreedyUntilWithLogitsRequest, LoglikelihoodRequest, LoglikelihoodRollingRequest, LoglikelihoodSingleTokenRequest, @@ -101,6 +99,7 @@ class LightevalTaskConfig: generation_size: int = None stop_sequence: Optional[Tuple[str]] = None output_regex: Optional[str] = None + num_samples: Optional[list[int]] = None frozen: bool = False suite: Optional[Tuple[str]] = None @@ -201,6 +200,11 @@ def __init__( # noqa: C901 hlog_warn(f"[WARNING] Not implemented yet: ignoring the metric {' ,'.join(ignored)} for task {self.name}.") current_categories = [Metrics[metric].value.category for metric in self.metrics] self.has_metric_category = {category: (category in current_categories) for category in MetricCategory} + # Sub-optimal system - we might want to store metric parametrisation in a yaml conf for example + # We assume num_samples always contains 1 (for base generative evals) + self.num_samples = [1] + [ + int(metric.replace("maj_at_", "").split("_")[0]) for metric in self.metrics if "maj_at_" in metric + ] # Data processing # to use once prompt formatting is managed as a module @@ -394,7 +398,7 @@ def doc_to_target(self, formatted_doc: Doc, few_shot: bool = False) -> str: return as_list(formatted_doc.get_golds(few_shot=few_shot))[0] # Requests - def get_request_type(self) -> list[RequestType]: + def get_request_type(self) -> list[RequestType]: # noqa C901 """ Returns the request types for the task.
@@ -408,25 +412,27 @@ def get_request_type(self) -> list[RequestType]: request_types = [] if self.has_metric_category[MetricCategory.TARGET_PERPLEXITY]: request_types.append(RequestType.LOGLIKELIHOOD) + if self.has_metric_category[MetricCategory.MULTICHOICE]: + request_types.append(RequestType.LOGLIKELIHOOD) + if self.has_metric_category[MetricCategory.MULTICHOICE_ONE_TOKEN]: + request_types.append(RequestType.LOGLIKELIHOOD_SINGLE_TOKEN) if self.has_metric_category[MetricCategory.PERPLEXITY]: request_types.append(RequestType.LOGLIKELIHOOD_ROLLING) if self.has_metric_category[MetricCategory.GENERATIVE]: request_types.append(RequestType.GREEDY_UNTIL) - if self.has_metric_category[MetricCategory.LLM_AS_JUDGE_MULTI_TURN]: - request_types.append(RequestType.GREEDY_UNTIL_MULTI_TURN) + if self.has_metric_category[MetricCategory.GENERATIVE_LOGPROB]: + request_types.append(RequestType.GREEDY_UNTIL) + if self.has_metric_category[MetricCategory.GENERATIVE_SAMPLING]: + request_types.append(RequestType.GREEDY_UNTIL) if self.has_metric_category[MetricCategory.LLM_AS_JUDGE]: request_types.append(RequestType.GREEDY_UNTIL) - if self.has_metric_category[MetricCategory.GENERATIVE_LOGPROB]: - request_types.append(RequestType.GREEDY_UNTIL_WITH_LOGITS) - if self.has_metric_category[MetricCategory.MULTICHOICE]: - request_types.append(RequestType.LOGLIKELIHOOD) - if self.has_metric_category[MetricCategory.MULTICHOICE_ONE_TOKEN]: - request_types.append(RequestType.LOGLIKELIHOOD_SINGLE_TOKEN) + if self.has_metric_category[MetricCategory.LLM_AS_JUDGE_MULTI_TURN]: + request_types.append(RequestType.GREEDY_UNTIL_MULTI_TURN) if len(request_types) == 0: raise NotImplementedError(f"Request type not implemented for task {self.name}") - return request_types + return list(set(request_types)) def construct_requests( self, formatted_doc: Doc, context: str, document_id_seed: str, current_task_name: str @@ -463,7 +469,13 @@ def construct_requests( task_name=current_task_name, example_index=document_id_seed, request_index=0, context=context ) ] - if self.has_metric_category[MetricCategory.GENERATIVE]: + if ( + self.has_metric_category[MetricCategory.GENERATIVE_SAMPLING] + or self.has_metric_category[MetricCategory.GENERATIVE] + or self.has_metric_category[MetricCategory.GENERATIVE_LOGPROB] + ): + # All these tasks require the same generation process - we can do them in one step + use_logits = self.has_metric_category[MetricCategory.GENERATIVE_LOGPROB] requests[RequestType.GREEDY_UNTIL] += [ GreedyUntilRequest( task_name=current_task_name, @@ -472,17 +484,8 @@ def construct_requests( context=context, stop_sequence=self.stop_sequence, generation_size=self.generation_size, - ) - ] - if self.has_metric_category[MetricCategory.GENERATIVE_LOGPROB]: - requests[RequestType.GREEDY_UNTIL_WITH_LOGITS] += [ - GreedyUntilWithLogitsRequest( - task_name=current_task_name, - example_index=document_id_seed, - request_index=0, - context=context, - stop_sequence=self.stop_sequence, - generation_size=self.generation_size, + num_samples=max(self.num_samples), # If we have several samplings to apply, we use the max + use_logits=use_logits, ) ] if self.has_metric_category[MetricCategory.MULTICHOICE]: @@ -543,14 +546,17 @@ def process_results(self, formatted_doc: Doc, results: list[ModelReturn]) -> dic results=results, formatted_doc=formatted_doc, metrics=self.metrics ) outputs.update(cur_outputs) - if self.has_metric_category[MetricCategory.GENERATIVE]: + if ( + self.has_metric_category[MetricCategory.GENERATIVE] + or 
self.has_metric_category[MetricCategory.GENERATIVE_SAMPLING] + or self.has_metric_category[MetricCategory.GENERATIVE_LOGPROB] + ): results, cur_outputs = apply_generative_metric( - results=results, formatted_doc=formatted_doc, metrics=self.metrics, output_regex=self.output_regex - ) - outputs.update(cur_outputs) - if self.has_metric_category[MetricCategory.GENERATIVE_LOGPROB]: - results, cur_outputs = apply_generative_logprob_metric( - results=results, formatted_doc=formatted_doc, metrics=self.metrics + results=results, + formatted_doc=formatted_doc, + metrics=self.metrics, + output_regex=self.output_regex, + max_num_samples=max(self.num_samples), ) outputs.update(cur_outputs) if self.has_metric_category[MetricCategory.MULTICHOICE]: diff --git a/src/lighteval/tasks/requests.py b/src/lighteval/tasks/requests.py index c4c86335..51abf61d 100644 --- a/src/lighteval/tasks/requests.py +++ b/src/lighteval/tasks/requests.py @@ -34,7 +34,6 @@ class RequestType(Enum): LOGLIKELIHOOD_ROLLING = auto() GREEDY_UNTIL = auto() GREEDY_UNTIL_MULTI_TURN = auto() - GREEDY_UNTIL_WITH_LOGITS = auto() @dataclass @@ -119,6 +118,8 @@ class GreedyUntilRequest(Request): generation_size: int request_type = RequestType.GREEDY_UNTIL tokenized_context: list[int] = None + num_samples: int = None + use_logits: bool = False @dataclass @@ -137,24 +138,6 @@ class GreedyUntilMultiTurnRequest(Request): request_type = RequestType.GREEDY_UNTIL_MULTI_TURN -@dataclass -class GreedyUntilWithLogitsRequest(Request): - """ - Represents a request for generating text using the Greedy-Until strategy but - returning the logits. - - Attributes: - stop_sequence (str): The sequence of tokens that indicates when to stop generating text. - generation_size (int): The maximum number of tokens to generate. - request_type (RequestType): The type of the request (GREEDY_UNTIL_WITH_LOGITS). - """ - - stop_sequence: Union[str, tuple[str], list[str]] - generation_size: int - request_type = RequestType.GREEDY_UNTIL_WITH_LOGITS - tokenized_context: list[int] = None - - class TaskExampleId(NamedTuple): """ Represents the identifier for an example in a task. 
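The maj@k metrics added in this patch reduce to a simple rule: sample k generations from the model (the base model now generates with `do_sample=True` and `num_return_sequences=num_samples` when `num_samples > 1`), normalize them, take the most frequent answer, and exact-match it against a single gold. Below is a minimal standalone sketch of that scoring rule, using illustrative names rather than lighteval's public API; the actual implementation is the `MajAtK` class shown above, which additionally supports prefix/suffix matching and separate gold/prediction normalizers.

```python
from collections import Counter
from typing import Callable, Optional


def majority_vote_score(
    golds: list[str],
    predictions: list[str],
    k: int,
    normalize: Optional[Callable[[str], str]] = None,
) -> int:
    """Return 1 if the most frequent of the first k predictions equals the gold, else 0."""
    if len(golds) > 1:
        raise ValueError("maj@k assumes a single gold answer")
    norm = normalize or (lambda s: s.strip())
    gold = norm(golds[0])
    answers = [norm(p) for p in predictions[:k]]
    # Majority vote over the (normalized) sampled answers
    majority, _count = Counter(answers).most_common(1)[0]
    return int(majority == gold)


# Example: maj@4 where three of the four sampled answers agree with the gold "42".
assert majority_vote_score(["42"], ["42", " 42", "7", "42 "], k=4) == 1
```

The `maj_at_4_math` and `maj_at_8_gsm8k` variants follow the same logic with the math/gsm8k normalizers plugged in, and the task config derives `num_samples` from the metric name (e.g. `maj_at_8_gsm8k` yields 8), so a single greedy-until request can serve both the greedy and sampling metrics.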
diff --git a/src/lighteval/tasks/tasks_table.jsonl b/src/lighteval/tasks/tasks_table.jsonl index 12e70f38..ecaeff04 100644 --- a/src/lighteval/tasks/tasks_table.jsonl +++ b/src/lighteval/tasks/tasks_table.jsonl @@ -442,7 +442,7 @@ {"name":"gpqa","suite":["lighteval"],"prompt_function":"gpqa","hf_repo":"Idavidrein/gpqa","hf_subset":"gpqa_main","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"gre_reading_comprehension","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"gre_reading_comprehension","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"gsm8k","suite":["leaderboard"],"prompt_function":"gsm8k","hf_repo":"gsm8k","hf_subset":"main","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":256,"metric":["quasi_exact_match_gsm8k"],"stop_sequence":["Question:","Question",":"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"gsm8k","suite":["lighteval"],"prompt_function":"gsm8k","hf_repo":"gsm8k","hf_subset":"main","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":256,"metric":["quasi_exact_match_gsm8k"],"stop_sequence":["Question:"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"gsm8k","suite":["lighteval"],"prompt_function":"gsm8k","hf_repo":"gsm8k","hf_subset":"main","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":256,"metric":["quasi_exact_match_gsm8k","maj_at_8_gsm8k"],"stop_sequence":["Question:"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"headqa:en","suite":["lighteval","headqa"],"prompt_function":"headqa","hf_repo":"lighteval/headqa_harness","hf_subset":"en","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"headqa:es","suite":["lighteval","headqa"],"prompt_function":"headqa","hf_repo":"lighteval/headqa_harness","hf_subset":"es","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"hellaswag","suite":["leaderboard"],"prompt_function":"hellaswag_harness","hf_repo":"hellaswag","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} @@ -540,13 +540,13 @@ 
{"name":"lsat_qa:grouping","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"grouping","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"lsat_qa:miscellaneous","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"miscellaneous","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"lsat_qa:ordering","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"ordering","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"math:algebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"algebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"math:counting_and_probability","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"counting_and_probability","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"math:geometry","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"geometry","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"math:intermediate_algebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"intermediate_algebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"math:number_theory","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"number_theory","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"math:prealgebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"prealgebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"math:precalculus","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"precalculus","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"math:algebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"algebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"math:counting_and_probability","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"counting_and_probability","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"math:geometry","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"geometry","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"math:intermediate_algebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"intermediate_algebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"math:number_theory","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"number_theory","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"math:prealgebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"prealgebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
+{"name":"math:precalculus","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"precalculus","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mathematical_induction","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"mathematical_induction","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mathqa","suite":["lighteval"],"prompt_function":"mathqa","hf_repo":"math_qa","hf_subset":"default","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"matrixshapes","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"matrixshapes","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} diff --git a/tests/test_unit_harness_metrics.py b/tests/test_unit_harness_metrics.py index 35f6634f..d8a6503a 100644 --- a/tests/test_unit_harness_metrics.py +++ b/tests/test_unit_harness_metrics.py @@ -26,7 +26,6 @@ import pytest from lighteval.metrics import ( - apply_generative_logprob_metric, apply_generative_metric, apply_multichoice_metric, apply_multichoice_metric_one_token, @@ -129,14 +128,13 @@ def apply_metric(metric, results, formatted_doc: Doc): if Metrics[metric].value.category == MetricCategory.PERPLEXITY: _, cur_outputs = apply_perplexity_metric(results=results, formatted_doc=formatted_doc, metrics=[metric]) return cur_outputs - if Metrics[metric].value.category == MetricCategory.GENERATIVE: + if Metrics[metric].value.category in [ + MetricCategory.GENERATIVE, + MetricCategory.GENERATIVE_LOGPROB, + MetricCategory.GENERATIVE_SAMPLING, + ]: _, cur_outputs = apply_generative_metric(results=results, formatted_doc=formatted_doc, metrics=[metric]) return cur_outputs - if Metrics[metric].value.category == MetricCategory.GENERATIVE_LOGPROB: - _, cur_outputs = apply_generative_logprob_metric( - results=results, formatted_doc=formatted_doc, metrics=[metric] - ) - return cur_outputs if Metrics[metric].value.category == MetricCategory.MULTICHOICE: _, cur_outputs = apply_multichoice_metric(results=results, formatted_doc=formatted_doc, metrics=[metric]) return cur_outputs From 6183bf224246383f11f149af2286bfd40b748a0c Mon Sep 17 00:00:00 2001 From: Nathan Habib <30601243+NathanHB@users.noreply.github.com> Date: Tue, 30 Apr 2024 16:17:24 +0200 Subject: [PATCH 05/25] Fix citation section in readme (#180) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add 'cite as' section in readme * add 'cite as' section in readme * fix citation --------- Co-authored-by: Nathan Habib Co-authored-by: Clémentine Fourrier 
<22726840+clefourrier@users.noreply.github.com> --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 71c66790..a056d7d3 100644 --- a/README.md +++ b/README.md @@ -456,5 +456,6 @@ python3 -m build . year = {2023}, version = {0.3.0}, url = {https://github.com/huggingface/lighteval} +} ``` From 979359dbbc504603b044f8ebfd1b525d182683d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20Rodr=C3=ADguez=20Salamanca?= Date: Fri, 3 May 2024 11:04:49 +0200 Subject: [PATCH 06/25] Fix broken link to extended tasks in README (#182) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a056d7d3..fe9748c3 100644 --- a/README.md +++ b/README.md @@ -186,7 +186,7 @@ python run_evals_accelerate.py \ Independently of the default tasks provided in `lighteval` that you will find in the `tasks_table.jsonl` file, you can use `lighteval` to evaluate models on tasks that require special processing (or have been added by the community). These tasks have their own evaluation suites and are defined as follows: -* `extended`: tasks which have complex pre- or post-processing and are added by the `lighteval` maintainers. See the [`extended_tasks`](./src/lighteval/tasks/extended_tasks) folder for examples. +* `extended`: tasks which have complex pre- or post-processing and are added by the `lighteval` maintainers. See the [`extended`](./src/lighteval/tasks/extended) folder for examples. * `community`: tasks which have been added by the community. See the [`community_tasks`](./community_tasks) folder for examples. * `custom`: tasks which are defined locally and not present in the core library. Use this suite if you want to experiment with designing a special metric or task. From c89b386e32950f185435bb135db67f640e14cff2 Mon Sep 17 00:00:00 2001 From: Philip May Date: Fri, 3 May 2024 14:14:36 +0200 Subject: [PATCH 07/25] Add version config option. 
(#181) * add version option to LightevalTaskConfig and LightevalTask * increase version for german rag evals to 1 * improve doc * change VERSION to lowercase * set version to 0 in tasks_table.jsonl * add version to arabic_evals * handle TODO comments --- community_tasks/arabic_evals.py | 15 + community_tasks/german_rag_evals.py | 4 + src/lighteval/tasks/lighteval_task.py | 8 +- src/lighteval/tasks/tasks_table.jsonl | 2456 ++++++++++++------------- 4 files changed, 1253 insertions(+), 1230 deletions(-) diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index 60db0450..6a0af9f5 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -75,6 +75,7 @@ def __init__( output_regex=None, frozen=False, trust_dataset=True, + version=0, ) @@ -142,6 +143,7 @@ def __init__( output_regex=None, frozen=False, trust_dataset=True, + version=0, ) @@ -173,6 +175,7 @@ def acva(line, task_name: str = None): few_shots_select="sequential", metric=["loglikelihood_acc_norm"], trust_dataset=True, + version=0, ) @@ -230,6 +233,7 @@ def __init__( stop_sequence=None, output_regex=None, frozen=False, + version=0, ) @@ -273,6 +277,7 @@ def alghafa_prompt(line, task_name: str = None): few_shots_select="sequential", metric=["loglikelihood_acc_norm"], trust_dataset=True, + version=0, ) @@ -289,6 +294,7 @@ def alghafa_prompt(line, task_name: str = None): few_shots_select="sequential", metric=["loglikelihood_acc_norm"], trust_dataset=True, + version=0, ) @@ -305,6 +311,7 @@ def alghafa_prompt(line, task_name: str = None): few_shots_select="sequential", metric=["loglikelihood_acc_norm"], trust_dataset=True, + version=0, ) @@ -321,6 +328,7 @@ def alghafa_prompt(line, task_name: str = None): few_shots_select="sequential", metric=["loglikelihood_acc_norm"], trust_dataset=True, + version=0, ) @@ -337,6 +345,7 @@ def alghafa_prompt(line, task_name: str = None): few_shots_select="sequential", metric=["loglikelihood_acc_norm"], trust_dataset=True, + version=0, ) @@ -353,6 +362,7 @@ def alghafa_prompt(line, task_name: str = None): few_shots_select="sequential", metric=["loglikelihood_acc_norm"], trust_dataset=True, + version=0, ) @@ -369,6 +379,7 @@ def alghafa_prompt(line, task_name: str = None): few_shots_select="sequential", metric=["loglikelihood_acc_norm"], trust_dataset=True, + version=0, ) @@ -408,6 +419,7 @@ def boolq_prompt_arabic(line, task_name: str = None): few_shots_select="sequential", metric=["loglikelihood_acc_norm"], trust_dataset=True, + version=0, ) @@ -443,6 +455,7 @@ def copa_prompt_arabic(line, task_name: str = None): few_shots_select="sequential", metric=["loglikelihood_acc_norm"], trust_dataset=True, + version=0, ) @@ -487,6 +500,7 @@ def hellaswag_prompt_arabic(line, task_name: str = None): few_shots_select="sequential", metric=["loglikelihood_acc_norm"], trust_dataset=True, + version=0, ) @@ -524,6 +538,7 @@ def toxigen_prompt_arabic(line, task_name: str = None): few_shots_select="sequential", metric=["loglikelihood_acc_norm"], trust_dataset=True, + version=0, ) diff --git a/community_tasks/german_rag_evals.py b/community_tasks/german_rag_evals.py index 55b35440..687eda0c 100644 --- a/community_tasks/german_rag_evals.py +++ b/community_tasks/german_rag_evals.py @@ -48,6 +48,7 @@ few_shots_split="test", few_shots_select="sequential", metric=["loglikelihood_acc"], + version=1, ) # Task 2: Choose context by question. 
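The `version` option introduced in this patch is a plain integer on `LightevalTaskConfig` (default 0) that task authors bump whenever the prompt format or underlying dataset changes, which is why the German RAG tasks move to `version=1` after their prompt fix earlier in this series. A hedged sketch of how a community task config might set it follows; the repository name and most field values are placeholders, and the keyword arguments are assumed from the community task examples above rather than from a definitive API reference.

```python
from lighteval.tasks.lighteval_task import LightevalTaskConfig

# Illustrative placeholder task, not an actual lighteval/community task.
my_task = LightevalTaskConfig(
    name="my_rag_task",
    prompt_function="my_rag_prompt",   # must be defined in the task file
    suite=["community"],
    hf_repo="my-org/my-rag-dataset",   # hypothetical dataset
    hf_subset="default",
    hf_avail_splits=["test"],
    evaluation_splits=["test"],
    metric=["loglikelihood_acc"],
    generation_size=-1,
    stop_sequence=["\n"],
    trust_dataset=True,
    version=1,  # bump when the prompt or dataset changes, so old scores are not silently compared
)
```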
@@ -64,6 +65,7 @@ few_shots_split="test", few_shots_select="sequential", metric=["loglikelihood_acc"], + version=1, ) @@ -81,6 +83,7 @@ few_shots_split="test", few_shots_select="sequential", metric=["loglikelihood_acc"], + version=1, ) # Task 4: Context-question match. @@ -97,6 +100,7 @@ few_shots_split="test", few_shots_select="sequential", metric=["loglikelihood_acc"], + version=1, ) diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index f9df6fdd..f5c7a1f9 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -85,6 +85,7 @@ class LightevalTaskConfig: output_regex (str) frozen (bool) trust_dataset (bool): Whether to trust the dataset at execution or not + version (int): The version of the task. Defaults to 0. Can be increased if the underlying dataset or the prompt changes. """ name: str @@ -111,6 +112,8 @@ class LightevalTaskConfig: must_remove_duplicate_docs: bool = None + version: int = 0 + def as_dict(self): return { "name": self.name, @@ -127,6 +130,7 @@ def as_dict(self): "output_regex": self.output_regex, "frozen": self.frozen, "suite": self.suite, + "version": self.version, } def __post_init__(self): @@ -162,7 +166,7 @@ def __init__( # noqa: C901 containing task-specific functions. Defaults to None. """ self.name = name - self.VERSION = 0 + self.version = cfg.version self.is_main_process = False self.cache_dir = cache_dir self._cfg = cfg @@ -684,7 +688,7 @@ def create_requests_from_tasks( # noqa: C901 # logs out the diferent versions of the tasks for every few shot for num_fewshot, _ in fewshot_dict[task_name]: cur_task_name = f"{task_name}|{num_fewshot}" - evaluation_tracker.versions_logger.log(cur_task_name, task.VERSION) + evaluation_tracker.versions_logger.log(cur_task_name, task.version) rnd = random.Random() rnd.seed(42) diff --git a/src/lighteval/tasks/tasks_table.jsonl b/src/lighteval/tasks/tasks_table.jsonl index ecaeff04..83cbbbd1 100644 --- a/src/lighteval/tasks/tasks_table.jsonl +++ b/src/lighteval/tasks/tasks_table.jsonl @@ -1,1228 +1,1228 @@ -{"name":"abstract_narrative_understanding","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"abstract_narrative_understanding","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:aqua-rat","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-aqua-rat","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:gaokao-biology","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-biology","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"agieval:gaokao-chemistry","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-chemistry","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:gaokao-chinese","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-chinese","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:gaokao-english","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-english","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:gaokao-geography","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-geography","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:gaokao-history","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-history","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:gaokao-mathqa","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-mathqa","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:gaokao-physics","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-physics","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:logiqa-en","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-logiqa-en","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"agieval:logiqa-zh","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-logiqa-zh","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:lsat-ar","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-lsat-ar","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:lsat-lr","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-lsat-lr","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:lsat-rc","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-lsat-rc","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:sat-en","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-sat-en","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:sat-en-without-passage","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-sat-en-without-passage","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"agieval:sat-math","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-sat-math","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"anachronisms","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"anachronisms","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"analogical_similarity","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"analogical_similarity","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"analytic_entailment","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"analytic_entailment","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"anli","suite":["lighteval","anli"],"prompt_function":"anli","hf_repo":"anli","hf_subset":"plain_text","hf_avail_splits":["train_r1","dev_r1","train_r2","dev_r2","train_r3","dev_r3","test_r1","test_r2","test_r3"],"evaluation_splits":["test_r1","test_r2","test_r3"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"anli:r1","suite":["lighteval","anli"],"prompt_function":"anli","hf_repo":"anli","hf_subset":"plain_text","hf_avail_splits":["train_r1","dev_r1","test_r1"],"evaluation_splits":["test_r1"],"few_shots_split":"train_r1","few_shots_select":"random_sampling_from_train","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"anli:r2","suite":["lighteval","anli"],"prompt_function":"anli","hf_repo":"anli","hf_subset":"plain_text","hf_avail_splits":["train_r2","dev_r2","test_r2"],"evaluation_splits":["test_r2"],"few_shots_split":"train_r2","few_shots_select":"random_sampling_from_train","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"anli:r3","suite":["lighteval","anli"],"prompt_function":"anli","hf_repo":"anli","hf_subset":"plain_text","hf_avail_splits":["train_r3","dev_r3","test_r3"],"evaluation_splits":["test_r3"],"few_shots_split":"train_r3","few_shots_select":"random_sampling_from_train","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"arc:c:letters","suite":["original","arc"],"prompt_function":"arc_with_options_letters_predict","hf_repo":"ai2_arc","hf_subset":"ARC-Challenge","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"arc:c:options","suite":["original","arc"],"prompt_function":"arc_with_options","hf_repo":"ai2_arc","hf_subset":"ARC-Challenge","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"arc:c:simple","suite":["original","arc"],"prompt_function":"arc","hf_repo":"ai2_arc","hf_subset":"ARC-Challenge","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"arc:challenge","suite":["leaderboard","arc"],"prompt_function":"arc","hf_repo":"ai2_arc","hf_subset":"ARC-Challenge","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":1,"metric":["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"arc:easy","suite":["lighteval","arc"],"prompt_function":"arc","hf_repo":"ai2_arc","hf_subset":"ARC-Easy","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"arithmetic:1dc","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_1dc","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"arithmetic:2da","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_2da","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"arithmetic:2dm","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_2dm","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"arithmetic:2ds","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_2ds","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"arithmetic:3da","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_3da","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"arithmetic:3ds","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_3ds","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"arithmetic:4da","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_4da","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"arithmetic:4ds","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_4ds","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"arithmetic:5da","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_5da","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"arithmetic:5ds","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_5ds","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"arithmetic_bb","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"arithmetic","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"ascii_word_recognition","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"ascii_word_recognition","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"asdiv","suite":["lighteval"],"prompt_function":"asdiv","hf_repo":"EleutherAI\/asdiv","hf_subset":"asdiv","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"authorship_verification","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"authorship_verification","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"auto_categorization","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"auto_categorization","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"auto_debugging","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_and_after_query","hf_repo":"bigbench","hf_subset":"auto_debugging","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["perfect_exact_match"],"stop_sequence":null,"output_regex":"[^\\.\\?\\!\\;\\n]+", "trust_dataset": true} -{"name":"babi_qa","suite":["helm"],"prompt_function":"babi_qa","hf_repo":"facebook\/babi_qa","hf_subset":"en-valid-qa1","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:causal_judgment","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"causal_judgement","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bigbench:date_understanding","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"date_understanding","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bigbench:disambiguation_qa","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"disambiguation_qa","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bigbench:geometric_shapes","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"geometric_shapes","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} 
-{"name":"bigbench:logical_deduction_five_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_five_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bigbench:logical_deduction_seven_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_seven_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bigbench:logical_deduction_three_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_three_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bigbench:movie_recommendation","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"movie_recommendation","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bigbench:navigate","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"navigate","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bigbench:reasoning_about_colored_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"reasoning_about_colored_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bigbench:ruin_names","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"ruin_names","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bigbench:salient_translation_error_detection","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"salient_translation_error_detection","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} 
-{"name":"bigbench:snarks","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"snarks","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bigbench:sports_understanding","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"sports_understanding","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bigbench:temporal_sequences","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"temporal_sequences","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bigbench:tracking_shuffled_objects_five_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_five_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bigbench:tracking_shuffled_objects_seven_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_seven_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bigbench:tracking_shuffled_objects_three_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_three_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bigbench:causal_judgment","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"causal_judgement","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true} -{"name":"bigbench:date_understanding","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"date_understanding","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true} 
-{"name":"bigbench:disambiguation_qa","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"disambiguation_qa","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true} -{"name":"bigbench:geometric_shapes","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"geometric_shapes","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true} -{"name":"bigbench:logical_deduction_five_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_five_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true} -{"name":"bigbench:logical_deduction_seven_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_seven_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true} -{"name":"bigbench:logical_deduction_three_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_three_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true} -{"name":"bigbench:movie_recommendation","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"movie_recommendation","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true} -{"name":"bigbench:navigate","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"navigate","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true} 
-{"name":"bigbench:reasoning_about_colored_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"reasoning_about_colored_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true} -{"name":"bigbench:ruin_names","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"ruin_names","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true} -{"name":"bigbench:salient_translation_error_detection","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"salient_translation_error_detection","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true} -{"name":"bigbench:snarks","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"snarks","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true} -{"name":"bigbench:sports_understanding","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"sports_understanding","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true} -{"name":"bigbench:temporal_sequences","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"temporal_sequences","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true} -{"name":"bigbench:tracking_shuffled_objects_five_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_five_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true} 
-{"name":"bigbench:tracking_shuffled_objects_seven_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_seven_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true} -{"name":"bigbench:tracking_shuffled_objects_three_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_three_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true} -{"name":"bbh:boolean_expressions","suite":["harness"],"prompt_function":"bbh_boolean_expressions","hf_repo":"lukaemon/bbh","hf_subset":"boolean_expressions","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bbh:causal_judgment","suite":["harness"],"prompt_function":"bbh_causal_judgment","hf_repo":"lukaemon/bbh","hf_subset":"causal_judgement","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bbh:date_understanding","suite":["harness"],"prompt_function":"bbh_date_understanding","hf_repo":"lukaemon/bbh","hf_subset":"date_understanding","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bbh:disambiguation_qa","suite":["harness"],"prompt_function":"bbh_disambiguation_qa","hf_repo":"lukaemon/bbh","hf_subset":"disambiguation_qa","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bbh:dyck_languages","suite":["harness"],"prompt_function":"bbh_dyck_languages","hf_repo":"lukaemon/bbh","hf_subset":"dyck_languages","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} 
-{"name":"bbh:formal_fallacies","suite":["harness"],"prompt_function":"bbh_formal_fallacies","hf_repo":"lukaemon/bbh","hf_subset":"formal_fallacies","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bbh:geometric_shapes","suite":["harness"],"prompt_function":"bbh_geometric_shapes","hf_repo":"lukaemon/bbh","hf_subset":"geometric_shapes","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bbh:hyperbaton","suite":["harness"],"prompt_function":"bbh_hyperbaton","hf_repo":"lukaemon/bbh","hf_subset":"hyperbaton","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bbh:logical_deduction_five_objects","suite":["harness"],"prompt_function":"bbh_logical_deduction_five_objects","hf_repo":"lukaemon/bbh","hf_subset":"logical_deduction_five_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bbh:logical_deduction_seven_objects","suite":["harness"],"prompt_function":"bbh_logical_deduction_seven_objects","hf_repo":"lukaemon/bbh","hf_subset":"logical_deduction_seven_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bbh:logical_deduction_three_objects","suite":["harness"],"prompt_function":"bbh_logical_deduction_three_objects","hf_repo":"lukaemon/bbh","hf_subset":"logical_deduction_three_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bbh:movie_recommendation","suite":["harness"],"prompt_function":"bbh_movie_recommendation","hf_repo":"lukaemon/bbh","hf_subset":"movie_recommendation","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} 
-{"name":"bbh:multistep_arithmetic_two","suite":["harness"],"prompt_function":"bbh_multistep_arithmetic_two","hf_repo":"lukaemon/bbh","hf_subset":"multistep_arithmetic_two","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bbh:navigate","suite":["harness"],"prompt_function":"bbh_navigate","hf_repo":"lukaemon/bbh","hf_subset":"navigate","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bbh:object_counting","suite":["harness"],"prompt_function":"bbh_object_counting","hf_repo":"lukaemon/bbh","hf_subset":"object_counting","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bbh:penguins_in_a_table","suite":["harness"],"prompt_function":"bbh_penguins_in_a_table","hf_repo":"lukaemon/bbh","hf_subset":"penguins_in_a_table","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bbh:reasoning_about_colored_objects","suite":["harness"],"prompt_function":"bbh_reasoning_about_colored_objects","hf_repo":"lukaemon/bbh","hf_subset":"reasoning_about_colored_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bbh:ruin_names","suite":["harness"],"prompt_function":"bbh_ruin_names","hf_repo":"lukaemon/bbh","hf_subset":"ruin_names","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bbh:salient_translation_error_detection","suite":["harness"],"prompt_function":"bbh_salient_translation_error_detection","hf_repo":"lukaemon/bbh","hf_subset":"salient_translation_error_detection","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} 
-{"name":"bbh:snarks","suite":["harness"],"prompt_function":"bbh_snarks","hf_repo":"lukaemon/bbh","hf_subset":"snarks","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bbh:sports_understanding","suite":["harness"],"prompt_function":"bbh_sports_understanding","hf_repo":"lukaemon/bbh","hf_subset":"sports_understanding","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bbh:temporal_sequences","suite":["harness"],"prompt_function":"bbh_temporal_sequences","hf_repo":"lukaemon/bbh","hf_subset":"temporal_sequences","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bbh:tracking_shuffled_objects_five_objects","suite":["harness"],"prompt_function":"bbh_tracking_shuffled_objects_five_objects","hf_repo":"lukaemon/bbh","hf_subset":"tracking_shuffled_objects_five_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bbh:tracking_shuffled_objects_seven_objects","suite":["harness"],"prompt_function":"bbh_tracking_shuffled_objects_seven_objects","hf_repo":"lukaemon/bbh","hf_subset":"tracking_shuffled_objects_seven_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bbh:tracking_shuffled_objects_three_objects","suite":["harness"],"prompt_function":"bbh_tracking_shuffled_objects_three_objects","hf_repo":"lukaemon/bbh","hf_subset":"tracking_shuffled_objects_three_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bbh:web_of_lies","suite":["harness"],"prompt_function":"bbh_web_of_lies","hf_repo":"lukaemon/bbh","hf_subset":"web_of_lies","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} 
-{"name":"bbh:word_sorting","suite":["harness"],"prompt_function":"bbh_word_sorting","hf_repo":"lukaemon/bbh","hf_subset":"word_sorting","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true} -{"name":"bbq","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"all","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bbq:Age","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Age","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bbq:Disability_status","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Disability_status","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bbq:Gender_identity","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Gender_identity","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bbq:Nationality","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Nationality","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bbq:Physical_appearance","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Physical_appearance","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"bbq:Race_ethnicity","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Race_ethnicity","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bbq:Race_x_SES","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Race_x_SES","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bbq:Race_x_gender","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Race_x_gender","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bbq:Religion","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Religion","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bbq:SES","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"SES","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bbq:Sexual_orientation","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Sexual_orientation","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bbq_lite_json","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"bbq_lite_json","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"bigbench:auto_debugging","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"auto_debugging","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:bbq_lite_json:age_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-age_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:bbq_lite_json:age_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-age_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:bbq_lite_json:disability_status_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-disability_status_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:bbq_lite_json:disability_status_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-disability_status_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:bbq_lite_json:gender_identity_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-gender_identity_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"bigbench:bbq_lite_json:gender_identity_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-gender_identity_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:bbq_lite_json:nationality_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-nationality_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:bbq_lite_json:nationality_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-nationality_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:bbq_lite_json:physical_appearance_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-physical_appearance_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:bbq_lite_json:physical_appearance_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-physical_appearance_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:bbq_lite_json:race_ethnicity_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-race_ethnicity_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"bigbench:bbq_lite_json:race_ethnicity_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-race_ethnicity_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:bbq_lite_json:religion_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-religion_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:bbq_lite_json:religion_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-religion_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:bbq_lite_json:ses_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-ses_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:bbq_lite_json:ses_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-ses_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:bbq_lite_json:sexual_orientation_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-sexual_orientation_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"bigbench:bbq_lite_json:sexual_orientation_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-sexual_orientation_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:code_line_description","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"code_line_description","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:conceptual_combinations:contradictions","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conceptual_combinations-contradictions","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:conceptual_combinations:emergent_properties","suite":["helm"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conceptual_combinations-emergent_properties","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:conceptual_combinations:fanciful_fictional_combinations","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conceptual_combinations-fanciful_fictional_combinations","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:conceptual_combinations:homonyms","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conceptual_combinations-homonyms","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"bigbench:conceptual_combinations:invented_words","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conceptual_combinations-invented_words","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:conlang_translation:adna_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-adna_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:conlang_translation:adna_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-adna_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:conlang_translation:atikampe_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-atikampe_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:conlang_translation:atikampe_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-atikampe_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:conlang_translation:gornam_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-gornam_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:conlang_translation:gornam_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-gornam_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"bigbench:conlang_translation:holuan_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-holuan_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:conlang_translation:holuan_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-holuan_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:conlang_translation:mkafala_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-mkafala_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:conlang_translation:mkafala_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-mkafala_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:conlang_translation:postpositive_english_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-postpositive_english_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:conlang_translation:postpositive_english_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-postpositive_english_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:conlang_translation:unapuri_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-unapuri_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"bigbench:conlang_translation:unapuri_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-unapuri_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:conlang_translation:vaomi_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-vaomi_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:conlang_translation:vaomi_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-vaomi_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:emoji_movie","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"emoji_movie","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:formal_fallacies_syllogisms_negation","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"formal_fallacies_syllogisms_negation","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:hindu_knowledge","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"hindu_knowledge","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:known_unknowns","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"known_unknowns","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"bigbench:language_identification","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"language_identification","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:linguistics_puzzles","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"linguistics_puzzles","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:logic_grid_puzzle","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"logic_grid_puzzle","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:logical_deduction-five_objects","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"logical_deduction-five_objects","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:logical_deduction-seven_objects","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"logical_deduction-seven_objects","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:logical_deduction-three_objects","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"logical_deduction-three_objects","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:misconceptions_russian","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"misconceptions_russian","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"bigbench:novel_concepts","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"novel_concepts","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:operators","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"operators","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:parsinlu_reading_comprehension","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"parsinlu_reading_comprehension","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:play_dialog_same_or_different","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"play_dialog_same_or_different","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:repeat_copy_logic","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"repeat_copy_logic","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:strange_stories-boolean","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"strange_stories-boolean","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:strange_stories-multiple_choice","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"strange_stories-multiple_choice","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"bigbench:strategyqa","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"strategyqa","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:symbol_interpretation-adversarial","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"symbol_interpretation-adversarial","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:symbol_interpretation-emoji_agnostic","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"symbol_interpretation-emoji_agnostic","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:symbol_interpretation-name_agnostic","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"symbol_interpretation-name_agnostic","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:symbol_interpretation-plain","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"symbol_interpretation-plain","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:symbol_interpretation-tricky","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"symbol_interpretation-tricky","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"bigbench:vitaminc_fact_verification","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"vitaminc_fact_verification","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bigbench:winowhy","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"winowhy","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:adjunct_island","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"adjunct_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:adjunct_island","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"adjunct_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:anaphor_gender_agreement","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"anaphor_gender_agreement","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:anaphor_gender_agreement","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"anaphor_gender_agreement","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:anaphor_number_agreement","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"anaphor_number_agreement","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:anaphor_number_agreement","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"anaphor_number_agreement","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"blimp:animate_subject_passive","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"animate_subject_passive","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:animate_subject_passive","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"animate_subject_passive","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:animate_subject_trans","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"animate_subject_trans","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:animate_subject_trans","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"animate_subject_trans","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:causative","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"causative","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:causative","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"causative","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:complex_NP_island","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"complex_NP_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:complex_NP_island","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"complex_NP_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"blimp:coordinate_structure_constraint_complex_left_branch","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"coordinate_structure_constraint_complex_left_branch","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:coordinate_structure_constraint_complex_left_branch","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"coordinate_structure_constraint_complex_left_branch","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:coordinate_structure_constraint_object_extraction","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"coordinate_structure_constraint_object_extraction","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:coordinate_structure_constraint_object_extraction","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"coordinate_structure_constraint_object_extraction","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:determiner_noun_agreement_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:determiner_noun_agreement_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:determiner_noun_agreement_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:determiner_noun_agreement_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"blimp:determiner_noun_agreement_irregular_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_irregular_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:determiner_noun_agreement_irregular_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_irregular_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:determiner_noun_agreement_irregular_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_irregular_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:determiner_noun_agreement_irregular_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_irregular_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:determiner_noun_agreement_with_adj_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adj_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:determiner_noun_agreement_with_adj_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adj_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:determiner_noun_agreement_with_adj_irregular_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adj_irregular_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:determiner_noun_agreement_with_adj_irregular_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adj_irregular_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"blimp:determiner_noun_agreement_with_adj_irregular_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adj_irregular_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:determiner_noun_agreement_with_adj_irregular_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adj_irregular_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:determiner_noun_agreement_with_adjective_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adjective_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:determiner_noun_agreement_with_adjective_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adjective_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:distractor_agreement_relational_noun","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"distractor_agreement_relational_noun","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:distractor_agreement_relational_noun","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"distractor_agreement_relational_noun","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:distractor_agreement_relative_clause","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"distractor_agreement_relative_clause","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:distractor_agreement_relative_clause","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"distractor_agreement_relative_clause","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"blimp:drop_argument","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"drop_argument","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:drop_argument","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"drop_argument","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:ellipsis_n_bar_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"ellipsis_n_bar_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:ellipsis_n_bar_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"ellipsis_n_bar_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:ellipsis_n_bar_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"ellipsis_n_bar_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:ellipsis_n_bar_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"ellipsis_n_bar_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:existential_there_object_raising","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"existential_there_object_raising","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:existential_there_object_raising","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"existential_there_object_raising","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:existential_there_quantifiers_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"existential_there_quantifiers_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, 
"trust_dataset": true} -{"name":"blimp:existential_there_quantifiers_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"existential_there_quantifiers_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:existential_there_quantifiers_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"existential_there_quantifiers_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:existential_there_quantifiers_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"existential_there_quantifiers_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:existential_there_subject_raising","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"existential_there_subject_raising","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:existential_there_subject_raising","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"existential_there_subject_raising","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:expletive_it_object_raising","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"expletive_it_object_raising","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:expletive_it_object_raising","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"expletive_it_object_raising","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:inchoative","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"inchoative","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"blimp:inchoative","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"inchoative","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:intransitive","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"intransitive","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:intransitive","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"intransitive","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:irregular_past_participle_adjectives","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"irregular_past_participle_adjectives","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:irregular_past_participle_adjectives","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"irregular_past_participle_adjectives","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:irregular_past_participle_verbs","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"irregular_past_participle_verbs","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:irregular_past_participle_verbs","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"irregular_past_participle_verbs","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:irregular_plural_subject_verb_agreement_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"irregular_plural_subject_verb_agreement_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"blimp:irregular_plural_subject_verb_agreement_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"irregular_plural_subject_verb_agreement_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:irregular_plural_subject_verb_agreement_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"irregular_plural_subject_verb_agreement_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:irregular_plural_subject_verb_agreement_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"irregular_plural_subject_verb_agreement_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:left_branch_island_echo_question","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"left_branch_island_echo_question","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:left_branch_island_echo_question","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"left_branch_island_echo_question","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:left_branch_island_simple_question","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"left_branch_island_simple_question","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:left_branch_island_simple_question","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"left_branch_island_simple_question","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:matrix_question_npi_licensor_present","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"matrix_question_npi_licensor_present","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"blimp:matrix_question_npi_licensor_present","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"matrix_question_npi_licensor_present","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:npi_present_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"npi_present_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:npi_present_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"npi_present_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:npi_present_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"npi_present_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:npi_present_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"npi_present_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:only_npi_licensor_present","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"only_npi_licensor_present","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:only_npi_licensor_present","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"only_npi_licensor_present","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:only_npi_scope","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"only_npi_scope","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:only_npi_scope","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"only_npi_scope","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, 
"trust_dataset": true} -{"name":"blimp:passive_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"passive_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:passive_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"passive_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:passive_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"passive_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:passive_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"passive_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:principle_A_c_command","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"principle_A_c_command","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:principle_A_c_command","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"principle_A_c_command","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:principle_A_case_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"principle_A_case_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:principle_A_case_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"principle_A_case_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:principle_A_case_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"principle_A_case_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"blimp:principle_A_case_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"principle_A_case_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:principle_A_domain_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"principle_A_domain_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:principle_A_domain_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"principle_A_domain_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:principle_A_domain_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"principle_A_domain_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:principle_A_domain_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"principle_A_domain_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:principle_A_domain_3","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"principle_A_domain_3","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:principle_A_domain_3","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"principle_A_domain_3","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:principle_A_reconstruction","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"principle_A_reconstruction","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"blimp:principle_A_reconstruction","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"principle_A_reconstruction","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:regular_plural_subject_verb_agreement_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"regular_plural_subject_verb_agreement_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:regular_plural_subject_verb_agreement_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"regular_plural_subject_verb_agreement_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:regular_plural_subject_verb_agreement_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"regular_plural_subject_verb_agreement_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:regular_plural_subject_verb_agreement_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"regular_plural_subject_verb_agreement_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:sentential_negation_npi_licensor_present","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"sentential_negation_npi_licensor_present","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:sentential_negation_npi_licensor_present","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"sentential_negation_npi_licensor_present","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:sentential_negation_npi_scope","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"sentential_negation_npi_scope","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"blimp:sentential_negation_npi_scope","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"sentential_negation_npi_scope","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:sentential_subject_island","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"sentential_subject_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:sentential_subject_island","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"sentential_subject_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:superlative_quantifiers_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"superlative_quantifiers_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:superlative_quantifiers_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"superlative_quantifiers_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:superlative_quantifiers_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"superlative_quantifiers_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:superlative_quantifiers_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"superlative_quantifiers_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:tough_vs_raising_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"tough_vs_raising_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"blimp:tough_vs_raising_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"tough_vs_raising_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:tough_vs_raising_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"tough_vs_raising_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:tough_vs_raising_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"tough_vs_raising_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:transitive","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"transitive","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:transitive","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"transitive","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:wh_island","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:wh_island","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:wh_questions_object_gap","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_questions_object_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:wh_questions_object_gap","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_questions_object_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"blimp:wh_questions_subject_gap","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_questions_subject_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:wh_questions_subject_gap","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_questions_subject_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:wh_questions_subject_gap_long_distance","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_questions_subject_gap_long_distance","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:wh_questions_subject_gap_long_distance","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_questions_subject_gap_long_distance","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:wh_vs_that_no_gap","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_vs_that_no_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:wh_vs_that_no_gap","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_vs_that_no_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:wh_vs_that_no_gap_long_distance","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_vs_that_no_gap_long_distance","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:wh_vs_that_no_gap_long_distance","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_vs_that_no_gap_long_distance","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"blimp:wh_vs_that_with_gap","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_vs_that_with_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:wh_vs_that_with_gap","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_vs_that_with_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:wh_vs_that_with_gap_long_distance","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_vs_that_with_gap_long_distance","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"blimp:wh_vs_that_with_gap_long_distance","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_vs_that_with_gap_long_distance","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bold","suite":["helm"],"prompt_function":"bold","hf_repo":"lighteval\/bold_helm","hf_subset":"all","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["prediction_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bold:gender","suite":["helm"],"prompt_function":"bold","hf_repo":"lighteval\/bold_helm","hf_subset":"gender","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["prediction_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bold:political_ideology","suite":["helm"],"prompt_function":"bold","hf_repo":"lighteval\/bold_helm","hf_subset":"political_ideology","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["prediction_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bold:profession","suite":["helm"],"prompt_function":"bold","hf_repo":"lighteval\/bold_helm","hf_subset":"profession","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["prediction_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bold:race","suite":["helm"],"prompt_function":"bold","hf_repo":"lighteval\/bold_helm","hf_subset":"race","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["prediction_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"bold:religious_ideology","suite":["helm"],"prompt_function":"bold","hf_repo":"lighteval\/bold_helm","hf_subset":"religious_ideology","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["prediction_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"boolq","suite":["helm","helm_general"],"prompt_function":"boolq_helm","hf_repo":"lighteval\/boolq_helm","hf_subset":"default","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"boolq:contrastset","suite":["helm"],"prompt_function":"boolq_helm_contrastset","hf_repo":"lighteval\/boolq_helm","hf_subset":"default","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"bridging_anaphora_resolution_barqa","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"bridging_anaphora_resolution_barqa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"causal_judgment","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"causal_judgment","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"cause_and_effect","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"cause_and_effect","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"checkmate_in_one","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"checkmate_in_one","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"chess_state_tracking","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"chess_state_tracking","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"chinese_remainder_theorem","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"chinese_remainder_theorem","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"cifar10_classification","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"cifar10_classification","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"civil_comments","suite":["helm","helm_general"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"all","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"civil_comments:LGBTQ","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"LGBTQ","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"civil_comments:black","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"black","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"civil_comments:christian","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"christian","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"civil_comments:female","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"female","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"civil_comments:male","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"male","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"civil_comments:muslim","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"muslim","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"civil_comments:other_religions","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"other_religions","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"civil_comments:white","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"white","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"code_line_description","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_and_after_query","hf_repo":"bigbench","hf_subset":"code_line_description","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"codenames","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"codenames","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["rouge_t5","bleu","bleu"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"color","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"color","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["rouge_t5","bleu","loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"common_morpheme","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"common_morpheme","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"commonsenseqa","suite":["helm","commonsense_scenario"],"prompt_function":"commonsense_qa","hf_repo":"commonsense_qa","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"conceptual_combinations","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"conceptual_combinations","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"conlang_translation","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"conlang_translation","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge_t5","bleu","perfect_exact_match"],"stop_sequence":[".",";","!","?"],"output_regex":"[^\\.\\?\\!\\;\\n]+", "trust_dataset": true} -{"name":"contextual_parametric_knowledge_conflicts","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"contextual_parametric_knowledge_conflicts","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["rouge_t5","loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"copyright:n_books_1000-extractions_per_book_1-prefix_length_125","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"n_books_1000-extractions_per_book_1-prefix_length_125","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"copyright:n_books_1000-extractions_per_book_1-prefix_length_25","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"n_books_1000-extractions_per_book_1-prefix_length_25","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"copyright:n_books_1000-extractions_per_book_1-prefix_length_5","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"n_books_1000-extractions_per_book_1-prefix_length_5","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"copyright:n_books_1000-extractions_per_book_3-prefix_length_125","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"n_books_1000-extractions_per_book_3-prefix_length_125","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"copyright:n_books_1000-extractions_per_book_3-prefix_length_25","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"n_books_1000-extractions_per_book_3-prefix_length_25","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"copyright:n_books_1000-extractions_per_book_3-prefix_length_5","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"n_books_1000-extractions_per_book_3-prefix_length_5","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"copyright:oh_the_places","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"oh_the_places","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"copyright:pilot","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"pilot","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"copyright:popular_books-prefix_length_10","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"popular_books-prefix_length_10","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"copyright:popular_books-prefix_length_125","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"popular_books-prefix_length_125","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, 
"trust_dataset": true} -{"name":"copyright:popular_books-prefix_length_25","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"popular_books-prefix_length_25","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"copyright:popular_books-prefix_length_250","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"popular_books-prefix_length_250","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"copyright:popular_books-prefix_length_5","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"popular_books-prefix_length_5","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"copyright:popular_books-prefix_length_50","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"popular_books-prefix_length_50","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"copyright:prompt_num_line_1-min_lines_20","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"prompt_num_line_1-min_lines_20","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"copyright:prompt_num_line_10-min_lines_20","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"prompt_num_line_10-min_lines_20","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"copyright:prompt_num_line_5-min_lines_20","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"prompt_num_line_5-min_lines_20","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"coqa","suite":["lighteval"],"prompt_function":"coqa","hf_repo":"coqa","hf_subset":"default","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["perfect_exact_match","f1_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"coqa_bb","suite":["lighteval","bigbench_programmatic","bigbench"],"prompt_function":"coqa","hf_repo":"coqa","hf_subset":"default","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["perfect_exact_match","f1_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"covid_dialogue","suite":["helm"],"prompt_function":"covid_dialogue","hf_repo":"lighteval\/covid_dialogue","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":128,"metric":["exact_match","quasi_exact_match","f1_score","rougeL","bleu_1","bleu_4"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"crash_blossom","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"crash_blossom","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"crass_ai","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"crass_ai","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"cryobiology_spanish","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"cryobiology_spanish","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"cryptonite","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"cryptonite","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"cs_algorithms","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"cs_algorithms","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"dark_humor_detection","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"dark_humor_detection","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"date_understanding","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"date_understanding","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"disambiguation_qa","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"disambiguation_qa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"discourse_marker_prediction","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"discourse_marker_prediction","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"disfl_qa","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"disfl_qa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"drop","suite":["lighteval"],"prompt_function":"drop","hf_repo":"lighteval/drop_harness","hf_subset":"default","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":"train","few_shots_select":"random_sampling_from_train","generation_size":null,"metric":["drop"],"stop_sequence":["."],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"dyck_language:2","suite":["helm"],"prompt_function":"dyck_language","hf_repo":"lighteval\/DyckLanguage","hf_subset":"2","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"dyck_language:3","suite":["helm"],"prompt_function":"dyck_language","hf_repo":"lighteval\/DyckLanguage","hf_subset":"3","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"dyck_language:4","suite":["helm"],"prompt_function":"dyck_language","hf_repo":"lighteval\/DyckLanguage","hf_subset":"4","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"dyck_languages","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"dyck_languages","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"elementary_math_qa","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"elementary_math_qa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"emoji_movie","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"emoji_movie","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["rouge_t5","bleu","loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"emojis_emotion_prediction","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"emojis_emotion_prediction","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"empirical_judgments","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"empirical_judgments","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"english_proverbs","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"english_proverbs","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"english_russian_proverbs","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"english_russian_proverbs","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"entailed_polarity","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"entailed_polarity","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"entailed_polarity_hindi","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"entailed_polarity_hindi","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"entity_data_imputation:Buy","suite":["helm"],"prompt_function":"entity_data_imputation","hf_repo":"lighteval\/Buy","hf_subset":"default","hf_avail_splits":["train","test","valid"],"evaluation_splits":["valid","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"entity_data_imputation:Restaurant","suite":["helm"],"prompt_function":"entity_data_imputation","hf_repo":"lighteval\/Restaurant","hf_subset":"default","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"entity_matching:Abt_Buy","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Abt_Buy","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"entity_matching:Amazon_Google","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Amazon_Google","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"entity_matching:Beer","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Beer","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"entity_matching:Company","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Company","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"entity_matching:DBLP_ACM","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"DBLP_ACM","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"entity_matching:DBLP_GoogleScholar","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"DBLP_GoogleScholar","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"entity_matching:Dirty_DBLP_ACM","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Dirty_DBLP_ACM","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"entity_matching:Dirty_DBLP_GoogleScholar","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Dirty_DBLP_GoogleScholar","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"entity_matching:Dirty_Walmart_Amazon","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Dirty_Walmart_Amazon","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"entity_matching:Dirty_iTunes_Amazon","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Dirty_iTunes_Amazon","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"entity_matching:Fodors_Zagats","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Fodors_Zagats","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"entity_matching:Walmart_Amazon","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Walmart_Amazon","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"entity_matching:iTunes_Amazon","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"iTunes_Amazon","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"epistemic_reasoning","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"epistemic_reasoning","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"ethics:commonsense","suite":["lighteval","ethics"],"prompt_function":"ethics_commonsense","hf_repo":"lighteval\/hendrycks_ethics","hf_subset":"commonsense","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"ethics:deontology","suite":["lighteval","ethics"],"prompt_function":"ethics_deontology","hf_repo":"lighteval\/hendrycks_ethics","hf_subset":"deontology","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"ethics:justice","suite":["lighteval","ethics"],"prompt_function":"ethics_justice","hf_repo":"lighteval\/hendrycks_ethics","hf_subset":"justice","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"ethics:utilitarianism","suite":["lighteval","ethics"],"prompt_function":"ethics_utilitarianism","hf_repo":"lighteval\/hendrycks_ethics","hf_subset":"utilitarianism","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"ethics:virtue","suite":["lighteval","ethics"],"prompt_function":"ethics_virtue","hf_repo":"lighteval\/hendrycks_ethics","hf_subset":"virtue","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"evaluating_information_essentiality","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"evaluating_information_essentiality","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"fact_checker","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"fact_checker","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"fantasy_reasoning","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"fantasy_reasoning","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"few_shot_nlg","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"few_shot_nlg","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","bleurt"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"figure_of_speech_detection","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"figure_of_speech_detection","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"formal_fallacies_syllogisms_negation","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"formal_fallacies_syllogisms_negation","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"gem","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"gem","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"gender_inclusive_sentences_german","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"gender_inclusive_sentences_german","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"general_knowledge","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"general_knowledge","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"geometric_shapes","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"geometric_shapes","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["rouge_t5","bleu","loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"glue:cola","suite":["lighteval","glue"],"prompt_function":"cola","hf_repo":"glue","hf_subset":"cola","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token", "mcc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"glue:mnli","suite":["lighteval","glue"],"prompt_function":"mnli","hf_repo":"glue","hf_subset":"mnli_matched","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"glue:mnli_mismatched","suite":["lighteval","glue"],"prompt_function":"mnli","hf_repo":"glue","hf_subset":"mnli_mismatched","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"glue:mrpc","suite":["lighteval","glue"],"prompt_function":"mrpc","hf_repo":"glue","hf_subset":"mrpc","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc", "loglikelihood_f1"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"glue:qnli","suite":["lighteval","glue"],"prompt_function":"qnli","hf_repo":"glue","hf_subset":"qnli","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"glue:qqp","suite":["lighteval","glue"],"prompt_function":"qqp","hf_repo":"glue","hf_subset":"qqp","hf_avail_splits":["train","validation","test"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc", "loglikelihood_f1"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"glue:rte","suite":["lighteval","glue"],"prompt_function":"rte","hf_repo":"glue","hf_subset":"rte","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"glue:sst2","suite":["lighteval","glue"],"prompt_function":"sst","hf_repo":"glue","hf_subset":"sst2","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, 
"trust_dataset": true} -{"name":"glue:stsb","suite":["lighteval","glue"],"prompt_function":"stsb","hf_repo":"glue","hf_subset":"stsb","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"glue:wnli","suite":["lighteval","glue"],"prompt_function":"wnli","hf_repo":"glue","hf_subset":"wnli","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"goal_step_wikihow","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"goal_step_wikihow","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"gpqa","suite":["lighteval"],"prompt_function":"gpqa","hf_repo":"Idavidrein/gpqa","hf_subset":"gpqa_main","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"gre_reading_comprehension","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"gre_reading_comprehension","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"gsm8k","suite":["leaderboard"],"prompt_function":"gsm8k","hf_repo":"gsm8k","hf_subset":"main","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":256,"metric":["quasi_exact_match_gsm8k"],"stop_sequence":["Question:","Question",":"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"gsm8k","suite":["lighteval"],"prompt_function":"gsm8k","hf_repo":"gsm8k","hf_subset":"main","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":256,"metric":["quasi_exact_match_gsm8k","maj_at_8_gsm8k"],"stop_sequence":["Question:"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"headqa:en","suite":["lighteval","headqa"],"prompt_function":"headqa","hf_repo":"lighteval/headqa_harness","hf_subset":"en","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"headqa:es","suite":["lighteval","headqa"],"prompt_function":"headqa","hf_repo":"lighteval/headqa_harness","hf_subset":"es","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"hellaswag","suite":["leaderboard"],"prompt_function":"hellaswag_harness","hf_repo":"hellaswag","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"hellaswag","suite":["helm","helm_general"],"prompt_function":"hellaswag_helm","hf_repo":"hellaswag","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"hhh_alignment","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"hhh_alignment","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"hindi_question_answering","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"hindi_question_answering","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"hindu_knowledge","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"hindu_knowledge","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"hinglish_toxicity","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"hinglish_toxicity","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"human_organs_senses","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"human_organs_senses","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"humaneval","suite":["helm","code_scenario"],"prompt_function":"humaneval","hf_repo":"openai_humaneval","hf_subset":"openai_humaneval","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":600,"metric":["code_humaneval"],"stop_sequence":["\nclass","\ndef","\nif","\nprint"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"hyperbaton","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"hyperbaton","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"identify_math_theorems","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"identify_math_theorems","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"identify_odd_metaphor","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"identify_odd_metaphor","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"imdb","suite":["helm","helm_general"],"prompt_function":"imdb","hf_repo":"lighteval\/IMDB_helm","hf_subset":"default","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"imdb:contrastset","suite":["helm"],"prompt_function":"imdb_contrastset","hf_repo":"lighteval\/IMDB_helm","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"implicatures","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"implicatures","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"implicit_relations","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"implicit_relations","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"intent_recognition","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"intent_recognition","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"interactive_qa_mmlu:abstract_algebra","suite":["helm","interactive_qa_mmlu_scenario"],"prompt_function":"mmlu_qa_abstract_algebra","hf_repo":"lighteval\/mmlu","hf_subset":"abstract_algebra","hf_avail_splits":["dev","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"interactive_qa_mmlu:college_chemistry","suite":["helm","interactive_qa_mmlu_scenario"],"prompt_function":"mmlu_qa_college_chemistry","hf_repo":"lighteval\/mmlu","hf_subset":"college_chemistry","hf_avail_splits":["dev","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"interactive_qa_mmlu:global_facts","suite":["helm","interactive_qa_mmlu_scenario"],"prompt_function":"mmlu_qa_global_facts","hf_repo":"lighteval\/mmlu","hf_subset":"global_facts","hf_avail_splits":["dev","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"interactive_qa_mmlu:miscellaneous","suite":["helm","interactive_qa_mmlu_scenario"],"prompt_function":"mmlu_qa_miscellaneous","hf_repo":"lighteval\/mmlu","hf_subset":"miscellaneous","hf_avail_splits":["dev","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"interactive_qa_mmlu:nutrition","suite":["helm","interactive_qa_mmlu_scenario"],"prompt_function":"mmlu_qa_nutrition","hf_repo":"lighteval\/mmlu","hf_subset":"nutrition","hf_avail_splits":["dev","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"interactive_qa_mmlu:us_foreign_policy","suite":["helm","interactive_qa_mmlu_scenario"],"prompt_function":"mmlu_qa_us_foreign_policy","hf_repo":"lighteval\/mmlu","hf_subset":"us_foreign_policy","hf_avail_splits":["dev","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"international_phonetic_alphabet_nli","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"international_phonetic_alphabet_nli","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"international_phonetic_alphabet_transliterate","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"international_phonetic_alphabet_transliterate","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"intersect_geometry","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"intersect_geometry","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"irony_identification","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"irony_identification","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"iwslt17:ar-en","suite":["lighteval","harness_selection"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_ar-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"iwslt17:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"iwslt17:en-ar","suite":["lighteval","harness_selection"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_ar-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"iwslt17:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"iwslt17:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"iwslt17:en-ja","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_en-ja","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"iwslt17:en-ko","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_en-ko","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"iwslt17:en-zh","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_en-zh","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"iwslt17:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"iwslt17:ja-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_ja-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"iwslt17:ko-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_ko-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"iwslt17:zh-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_zh-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"kanji_ascii","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"kanji_ascii","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"kannada","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"kannada","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"key_value_maps","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"key_value_maps","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"known_unknowns","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"known_unknowns","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lambada:standard","suite":["lighteval","lambada"],"prompt_function":"lambada","hf_repo":"lambada","hf_subset":"plain_text","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lambada:standard_cloze","suite":["lighteval","lambada"],"prompt_function":"lambada_cloze","hf_repo":"lambada","hf_subset":"plain_text","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lambada:openai","suite":["lighteval","lambada"],"prompt_function":"lambada","hf_repo":"EleutherAI\/lambada_openai","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lambada:openai:de","suite":["lighteval","lambada"],"prompt_function":"lambada","hf_repo":"EleutherAI\/lambada_openai","hf_subset":"de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"lambada:openai:en","suite":["lighteval","lambada"],"prompt_function":"lambada","hf_repo":"EleutherAI\/lambada_openai","hf_subset":"en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lambada:openai:es","suite":["lighteval","lambada"],"prompt_function":"lambada","hf_repo":"EleutherAI\/lambada_openai","hf_subset":"es","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lambada:openai:fr","suite":["lighteval","lambada"],"prompt_function":"lambada","hf_repo":"EleutherAI\/lambada_openai","hf_subset":"fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lambada:openai:it","suite":["lighteval","lambada"],"prompt_function":"lambada","hf_repo":"EleutherAI\/lambada_openai","hf_subset":"it","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lambada:openai_cloze","suite":["lighteval","lambada"],"prompt_function":"lambada_cloze","hf_repo":"EleutherAI\/lambada_openai","hf_subset":"en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"language_games","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"language_games","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"language_identification","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"language_identification","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"legal_summarization:billsum","suite":["helm"],"prompt_function":"legal_summarization","hf_repo":"lighteval\/legal_summarization","hf_subset":"BillSum","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1024,"metric":["rouge1","rouge2","rougeL","faithfulness","extractiveness","bert_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"legal_summarization:eurlexsum","suite":["helm"],"prompt_function":"legal_summarization","hf_repo":"lighteval\/legal_summarization","hf_subset":"EurLexSum","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["rouge1","rouge2","rougeL","faithfulness","extractiveness","bert_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"legal_summarization:multilexsum","suite":["helm"],"prompt_function":"multilexsum","hf_repo":"lighteval\/legal_summarization","hf_subset":"MultiLexSum","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":256,"metric":["rouge1","rouge2","rougeL","faithfulness","extractiveness","bert_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"legalsupport","suite":["helm"],"prompt_function":"legal_support","hf_repo":"lighteval\/LegalSupport","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["loglikelihood_acc","exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lexglue:case_hold","suite":["helm","lex_glue_scenario"],"prompt_function":"lex_glue_case_hold","hf_repo":"lighteval\/lexglue","hf_subset":"case_hold","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lexglue:ecthr_a","suite":["helm","lex_glue_scenario"],"prompt_function":"lex_glue_ecthr_a","hf_repo":"lighteval\/lexglue","hf_subset":"ecthr_a","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lexglue:ecthr_b","suite":["helm","lex_glue_scenario"],"prompt_function":"lex_glue_ecthr_b","hf_repo":"lighteval\/lexglue","hf_subset":"ecthr_b","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lexglue:eurlex","suite":["helm","lex_glue_scenario"],"prompt_function":"lex_glue_eurlex","hf_repo":"lighteval\/lexglue","hf_subset":"eurlex","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"lexglue:ledgar","suite":["helm","lex_glue_scenario"],"prompt_function":"lex_glue_ledgar","hf_repo":"lighteval\/lexglue","hf_subset":"ledgar","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lexglue:scotus","suite":["helm","lex_glue_scenario"],"prompt_function":"lex_glue_scotus","hf_repo":"lighteval\/lexglue","hf_subset":"scotus","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lexglue:unfair_tos","suite":["helm","lex_glue_scenario"],"prompt_function":"lex_glue_unfair_tos","hf_repo":"lighteval\/lexglue","hf_subset":"unfair_tos","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lextreme:brazilian_court_decisions_judgment","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_brazilian_court_decisions_judgment","hf_repo":"lighteval\/lextreme","hf_subset":"brazilian_court_decisions_judgment","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lextreme:brazilian_court_decisions_unanimity","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_brazilian_court_decisions_unanimity","hf_repo":"lighteval\/lextreme","hf_subset":"brazilian_court_decisions_unanimity","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lextreme:covid19_emergency_event","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_covid19_emergency_event","hf_repo":"lighteval\/lextreme","hf_subset":"covid19_emergency_event","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"lextreme:german_argument_mining","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_german_argument_mining","hf_repo":"lighteval\/lextreme","hf_subset":"german_argument_mining","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lextreme:greek_legal_code_chapter","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_greek_legal_code_chapter","hf_repo":"lighteval\/lextreme","hf_subset":"greek_legal_code_chapter","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lextreme:greek_legal_code_subject","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_greek_legal_code_subject","hf_repo":"lighteval\/lextreme","hf_subset":"greek_legal_code_subject","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lextreme:greek_legal_code_volume","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_greek_legal_code_volume","hf_repo":"lighteval\/lextreme","hf_subset":"greek_legal_code_volume","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lextreme:greek_legal_ner","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_greek_legal_ner","hf_repo":"lighteval\/lextreme","hf_subset":"greek_legal_ner","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":430,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lextreme:legalnero","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_legalnero","hf_repo":"lighteval\/lextreme","hf_subset":"legalnero","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":788,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"lextreme:lener_br","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_lener_br","hf_repo":"lighteval\/lextreme","hf_subset":"lener_br","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":338,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lextreme:mapa_coarse","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_mapa_coarse","hf_repo":"lighteval\/lextreme","hf_subset":"mapa_coarse","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":274,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lextreme:mapa_fine","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_mapa_fine","hf_repo":"lighteval\/lextreme","hf_subset":"mapa_fine","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":274,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lextreme:multi_eurlex_level_1","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_multi_eurlex_level_1","hf_repo":"lighteval\/lextreme","hf_subset":"multi_eurlex_level_1","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lextreme:multi_eurlex_level_2","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_multi_eurlex_level_2","hf_repo":"lighteval\/lextreme","hf_subset":"multi_eurlex_level_2","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lextreme:multi_eurlex_level_3","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_multi_eurlex_level_3","hf_repo":"lighteval\/lextreme","hf_subset":"multi_eurlex_level_3","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"lextreme:online_terms_of_service_clause_topics","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_online_terms_of_service_clause_topics","hf_repo":"lighteval\/lextreme","hf_subset":"online_terms_of_service_clause_topics","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lextreme:online_terms_of_service_unfairness_levels","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_online_terms_of_service_unfairness_levels","hf_repo":"lighteval\/lextreme","hf_subset":"online_terms_of_service_unfairness_levels","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lextreme:swiss_judgment_prediction","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_swiss_judgment_prediction","hf_repo":"lighteval\/lextreme","hf_subset":"swiss_judgment_prediction","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"linguistic_mappings","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"linguistic_mappings","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"linguistics_puzzles","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"linguistics_puzzles","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["bleu","rouge_t5","perfect_exact_match"],"stop_sequence":null,"output_regex":"[^\\.\\?\\!\\;\\n]+", "trust_dataset": true} -{"name":"logic_grid_puzzle","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"logic_grid_puzzle","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"logical_args","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"logical_args","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"logical_deduction","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"logical_deduction","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"logical_fallacy_detection","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"logical_fallacy_detection","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"logical_sequence","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"logical_sequence","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"logiqa","suite":["lighteval"],"prompt_function":"logiqa","hf_repo":"lighteval/logiqa_harness","hf_subset":"logiqa","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lsat_qa","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"all","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lsat_qa:assignment","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"assignment","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lsat_qa:grouping","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"grouping","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"lsat_qa:miscellaneous","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"miscellaneous","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"lsat_qa:ordering","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"ordering","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"math:algebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"algebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"math:counting_and_probability","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"counting_and_probability","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"math:geometry","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"geometry","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"math:intermediate_algebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"intermediate_algebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"math:number_theory","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"number_theory","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"math:prealgebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"prealgebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"math:precalculus","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"precalculus","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mathematical_induction","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"mathematical_induction","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mathqa","suite":["lighteval"],"prompt_function":"mathqa","hf_repo":"math_qa","hf_subset":"default","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"matrixshapes","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"matrixshapes","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"me_q_sum","suite":["helm"],"prompt_function":"me_q_sum","hf_repo":"lighteval\/me_q_sum","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":128,"metric":["exact_match","quasi_exact_match","f1_score","rougeL","bleu_1","bleu_4"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"med_dialog:healthcaremagic","suite":["helm"],"prompt_function":"med_dialog","hf_repo":"lighteval\/med_dialog","hf_subset":"healthcaremagic","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":128,"metric":["exact_match","quasi_exact_match","f1_score","rougeL","bleu_1","bleu_4"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"med_dialog:icliniq","suite":["helm"],"prompt_function":"med_dialog","hf_repo":"lighteval\/med_dialog","hf_subset":"icliniq","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":128,"metric":["exact_match","quasi_exact_match","f1_score","rougeL","bleu_1","bleu_4"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"med_mcqa","suite":["helm"],"prompt_function":"med_mcqa","hf_repo":"lighteval\/med_mcqa","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["loglikelihood_acc","exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"med_paragraph_simplification","suite":["helm"],"prompt_function":"med_paragraph_simplification","hf_repo":"lighteval\/med_paragraph_simplification","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":512,"metric":["exact_match","quasi_exact_match","f1_score","rougeL","bleu_1","bleu_4"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"med_qa","suite":["helm"],"prompt_function":"med_qa","hf_repo":"bigbio\/med_qa","hf_subset":"med_qa_en_source","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["loglikelihood_acc","exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"metaphor_boolean","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"metaphor_boolean","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"metaphor_understanding","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"metaphor_understanding","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mgsm:en","suite":["lighteval"],"prompt_function":"mgsm_en","hf_repo":"juletxara/mgsm","hf_subset":"en","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "Question:"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mgsm:es","suite":["lighteval"],"prompt_function":"mgsm_es","hf_repo":"juletxara/mgsm","hf_subset":"es","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "Pregunta:"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mgsm:fr","suite":["lighteval"],"prompt_function":"mgsm_fr","hf_repo":"juletxara/mgsm","hf_subset":"fr","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "Question:"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mgsm:de","suite":["lighteval"],"prompt_function":"mgsm_de","hf_repo":"juletxara/mgsm","hf_subset":"de","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "Frage:"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mgsm:ru","suite":["lighteval"],"prompt_function":"mgsm_ru","hf_repo":"juletxara/mgsm","hf_subset":"ru","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "\u0417\u0430\u0434\u0430\u0447\u0430:"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mgsm:zh","suite":["lighteval"],"prompt_function":"mgsm_zh","hf_repo":"juletxara/mgsm","hf_subset":"zh","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "\u95ee\u9898:"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mgsm:ja","suite":["lighteval"],"prompt_function":"mgsm_ja","hf_repo":"juletxara/mgsm","hf_subset":"ja","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "\u554f\u984c:"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mgsm:th","suite":["lighteval"],"prompt_function":"mgsm_th","hf_repo":"juletxara/mgsm","hf_subset":"th","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "\u0e42\u0e08\u0e17\u0e22\u0e4c:"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mgsm:sw","suite":["lighteval"],"prompt_function":"mgsm_sw","hf_repo":"juletxara/mgsm","hf_subset":"sw","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "Swali:"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mgsm:bn","suite":["lighteval"],"prompt_function":"mgsm_bn","hf_repo":"juletxara/mgsm","hf_subset":"bn","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "\u09aa\u09cd\u09b0\u09b6\u09cd\u09a8:"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mgsm:te","suite":["lighteval"],"prompt_function":"mgsm_te","hf_repo":"juletxara/mgsm","hf_subset":"te","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "\u0c2a\u0c4d\u0c30\u0c36\u0c4d\u0c28:"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"minute_mysteries_qa","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"minute_mysteries_qa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","rouge_t5"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"misconceptions","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"misconceptions","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"misconceptions_russian","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"misconceptions_russian","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"all","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu","suite":["original"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"all","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":5,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:abstract_algebra","suite":["original","mmlu"],"prompt_function":"mmlu_abstract_algebra","hf_repo":"cais\/mmlu","hf_subset":"abstract_algebra","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:abstract_algebra","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"abstract_algebra","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:abstract_algebra","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"abstract_algebra","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:anatomy","suite":["original","mmlu"],"prompt_function":"mmlu_anatomy","hf_repo":"cais\/mmlu","hf_subset":"anatomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mmlu:anatomy","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"anatomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:anatomy","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"anatomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:astronomy","suite":["original","mmlu"],"prompt_function":"mmlu_astronomy","hf_repo":"cais\/mmlu","hf_subset":"astronomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:astronomy","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"astronomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:astronomy","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"astronomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:business_ethics","suite":["original","mmlu"],"prompt_function":"mmlu_business_ethics","hf_repo":"cais\/mmlu","hf_subset":"business_ethics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:business_ethics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"business_ethics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mmlu:business_ethics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"business_ethics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:clinical_knowledge","suite":["original","mmlu"],"prompt_function":"mmlu_clinical_knowledge","hf_repo":"cais\/mmlu","hf_subset":"clinical_knowledge","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:clinical_knowledge","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"clinical_knowledge","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:clinical_knowledge","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"clinical_knowledge","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:college_biology","suite":["original","mmlu"],"prompt_function":"mmlu_college_biology","hf_repo":"cais\/mmlu","hf_subset":"college_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:college_biology","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:college_biology","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"college_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mmlu:college_chemistry","suite":["original","mmlu"],"prompt_function":"mmlu_college_chemistry","hf_repo":"cais\/mmlu","hf_subset":"college_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:college_chemistry","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:college_chemistry","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"college_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:college_computer_science","suite":["original","mmlu"],"prompt_function":"mmlu_college_computer_science","hf_repo":"cais\/mmlu","hf_subset":"college_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:college_computer_science","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:college_computer_science","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"college_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:college_mathematics","suite":["original","mmlu"],"prompt_function":"mmlu_college_mathematics","hf_repo":"cais\/mmlu","hf_subset":"college_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mmlu:college_mathematics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:college_mathematics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"college_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:college_medicine","suite":["original","mmlu"],"prompt_function":"mmlu_college_medicine","hf_repo":"cais\/mmlu","hf_subset":"college_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:college_medicine","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:college_medicine","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"college_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:college_physics","suite":["original","mmlu"],"prompt_function":"mmlu_college_physics","hf_repo":"cais\/mmlu","hf_subset":"college_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:college_physics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mmlu:college_physics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"college_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:computer_security","suite":["original","mmlu"],"prompt_function":"mmlu_computer_security","hf_repo":"cais\/mmlu","hf_subset":"computer_security","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:computer_security","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"computer_security","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:computer_security","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"computer_security","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:conceptual_physics","suite":["original","mmlu"],"prompt_function":"mmlu_conceptual_physics","hf_repo":"cais\/mmlu","hf_subset":"conceptual_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:conceptual_physics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"conceptual_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:conceptual_physics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"conceptual_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mmlu:econometrics","suite":["original","mmlu"],"prompt_function":"mmlu_econometrics","hf_repo":"cais\/mmlu","hf_subset":"econometrics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:econometrics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"econometrics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:econometrics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"econometrics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:electrical_engineering","suite":["original","mmlu"],"prompt_function":"mmlu_electrical_engineering","hf_repo":"cais\/mmlu","hf_subset":"electrical_engineering","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:electrical_engineering","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"electrical_engineering","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:electrical_engineering","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"electrical_engineering","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:elementary_mathematics","suite":["original","mmlu"],"prompt_function":"mmlu_elementary_mathematics","hf_repo":"cais\/mmlu","hf_subset":"elementary_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mmlu:elementary_mathematics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"elementary_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:elementary_mathematics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"elementary_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:formal_logic","suite":["original","mmlu"],"prompt_function":"mmlu_formal_logic","hf_repo":"cais\/mmlu","hf_subset":"formal_logic","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:formal_logic","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"formal_logic","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:formal_logic","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"formal_logic","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:global_facts","suite":["original","mmlu"],"prompt_function":"mmlu_global_facts","hf_repo":"cais\/mmlu","hf_subset":"global_facts","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:global_facts","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"global_facts","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mmlu:global_facts","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"global_facts","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_biology","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_biology","hf_repo":"cais\/mmlu","hf_subset":"high_school_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_biology","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_biology","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_chemistry","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_chemistry","hf_repo":"cais\/mmlu","hf_subset":"high_school_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_chemistry","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_chemistry","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mmlu:high_school_computer_science","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_computer_science","hf_repo":"cais\/mmlu","hf_subset":"high_school_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_computer_science","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_computer_science","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_european_history","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_european_history","hf_repo":"cais\/mmlu","hf_subset":"high_school_european_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_european_history","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_european_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_european_history","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_european_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_geography","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_geography","hf_repo":"cais\/mmlu","hf_subset":"high_school_geography","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mmlu:high_school_geography","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_geography","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_geography","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_geography","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_government_and_politics","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_government_and_politics","hf_repo":"cais\/mmlu","hf_subset":"high_school_government_and_politics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_government_and_politics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_government_and_politics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_government_and_politics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_government_and_politics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_macroeconomics","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_macroeconomics","hf_repo":"cais\/mmlu","hf_subset":"high_school_macroeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_macroeconomics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_macroeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mmlu:high_school_macroeconomics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_macroeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_mathematics","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_mathematics","hf_repo":"cais\/mmlu","hf_subset":"high_school_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_mathematics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_mathematics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_microeconomics","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_microeconomics","hf_repo":"cais\/mmlu","hf_subset":"high_school_microeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_microeconomics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_microeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_microeconomics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_microeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mmlu:high_school_physics","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_physics","hf_repo":"cais\/mmlu","hf_subset":"high_school_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_physics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_physics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_psychology","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_psychology","hf_repo":"cais\/mmlu","hf_subset":"high_school_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_psychology","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_psychology","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_statistics","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_statistics","hf_repo":"cais\/mmlu","hf_subset":"high_school_statistics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mmlu:high_school_statistics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_statistics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_statistics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_statistics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_us_history","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_us_history","hf_repo":"cais\/mmlu","hf_subset":"high_school_us_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_us_history","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_us_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_us_history","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_us_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_world_history","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_world_history","hf_repo":"cais\/mmlu","hf_subset":"high_school_world_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_world_history","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_world_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mmlu:high_school_world_history","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_world_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:human_aging","suite":["original","mmlu"],"prompt_function":"mmlu_human_aging","hf_repo":"cais\/mmlu","hf_subset":"human_aging","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:human_aging","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"human_aging","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:human_aging","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"human_aging","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:human_sexuality","suite":["original","mmlu"],"prompt_function":"mmlu_human_sexuality","hf_repo":"cais\/mmlu","hf_subset":"human_sexuality","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:human_sexuality","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"human_sexuality","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:human_sexuality","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"human_sexuality","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mmlu:international_law","suite":["original","mmlu"],"prompt_function":"mmlu_international_law","hf_repo":"cais\/mmlu","hf_subset":"international_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:international_law","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"international_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:international_law","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"international_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:jurisprudence","suite":["original","mmlu"],"prompt_function":"mmlu_jurisprudence","hf_repo":"cais\/mmlu","hf_subset":"jurisprudence","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:jurisprudence","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"jurisprudence","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:jurisprudence","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"jurisprudence","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:logical_fallacies","suite":["original","mmlu"],"prompt_function":"mmlu_logical_fallacies","hf_repo":"cais\/mmlu","hf_subset":"logical_fallacies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mmlu:logical_fallacies","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"logical_fallacies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:logical_fallacies","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"logical_fallacies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:machine_learning","suite":["original","mmlu"],"prompt_function":"mmlu_machine_learning","hf_repo":"cais\/mmlu","hf_subset":"machine_learning","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:machine_learning","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"machine_learning","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:machine_learning","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"machine_learning","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:management","suite":["original","mmlu"],"prompt_function":"mmlu_management","hf_repo":"cais\/mmlu","hf_subset":"management","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:management","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"management","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mmlu:management","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"management","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:marketing","suite":["original","mmlu"],"prompt_function":"mmlu_marketing","hf_repo":"cais\/mmlu","hf_subset":"marketing","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:marketing","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"marketing","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:marketing","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"marketing","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:medical_genetics","suite":["original","mmlu"],"prompt_function":"mmlu_medical_genetics","hf_repo":"cais\/mmlu","hf_subset":"medical_genetics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:medical_genetics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"medical_genetics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:medical_genetics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"medical_genetics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mmlu:miscellaneous","suite":["original","mmlu"],"prompt_function":"mmlu_miscellaneous","hf_repo":"cais\/mmlu","hf_subset":"miscellaneous","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:miscellaneous","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"miscellaneous","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:miscellaneous","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"miscellaneous","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:moral_disputes","suite":["original","mmlu"],"prompt_function":"mmlu_moral_disputes","hf_repo":"cais\/mmlu","hf_subset":"moral_disputes","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:moral_disputes","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"moral_disputes","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:moral_disputes","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"moral_disputes","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:moral_scenarios","suite":["original","mmlu"],"prompt_function":"mmlu_moral_scenarios","hf_repo":"cais\/mmlu","hf_subset":"moral_scenarios","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mmlu:moral_scenarios","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"moral_scenarios","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:moral_scenarios","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"moral_scenarios","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:nutrition","suite":["original","mmlu"],"prompt_function":"mmlu_nutrition","hf_repo":"cais\/mmlu","hf_subset":"nutrition","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:nutrition","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"nutrition","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:nutrition","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"nutrition","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:philosophy","suite":["original","mmlu"],"prompt_function":"mmlu_philosophy","hf_repo":"cais\/mmlu","hf_subset":"philosophy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:philosophy","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"philosophy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mmlu:philosophy","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"philosophy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:prehistory","suite":["original","mmlu"],"prompt_function":"mmlu_prehistory","hf_repo":"cais\/mmlu","hf_subset":"prehistory","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:prehistory","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"prehistory","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:prehistory","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"prehistory","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:professional_accounting","suite":["original","mmlu"],"prompt_function":"mmlu_professional_accounting","hf_repo":"cais\/mmlu","hf_subset":"professional_accounting","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:professional_accounting","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"professional_accounting","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:professional_accounting","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"professional_accounting","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mmlu:professional_law","suite":["original","mmlu"],"prompt_function":"mmlu_professional_law","hf_repo":"cais\/mmlu","hf_subset":"professional_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:professional_law","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"professional_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:professional_law","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"professional_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:professional_medicine","suite":["original","mmlu"],"prompt_function":"mmlu_professional_medicine","hf_repo":"cais\/mmlu","hf_subset":"professional_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:professional_medicine","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"professional_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:professional_medicine","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"professional_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:professional_psychology","suite":["original","mmlu"],"prompt_function":"mmlu_professional_psychology","hf_repo":"cais\/mmlu","hf_subset":"professional_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mmlu:professional_psychology","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"professional_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:professional_psychology","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"professional_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:public_relations","suite":["original","mmlu"],"prompt_function":"mmlu_public_relations","hf_repo":"cais\/mmlu","hf_subset":"public_relations","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:public_relations","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"public_relations","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:public_relations","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"public_relations","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:security_studies","suite":["original","mmlu"],"prompt_function":"mmlu_security_studies","hf_repo":"cais\/mmlu","hf_subset":"security_studies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:security_studies","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"security_studies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mmlu:security_studies","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"security_studies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:sociology","suite":["original","mmlu"],"prompt_function":"mmlu_sociology","hf_repo":"cais\/mmlu","hf_subset":"sociology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:sociology","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"sociology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:sociology","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"sociology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:us_foreign_policy","suite":["original","mmlu"],"prompt_function":"mmlu_us_foreign_policy","hf_repo":"cais\/mmlu","hf_subset":"us_foreign_policy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:us_foreign_policy","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"us_foreign_policy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:us_foreign_policy","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"us_foreign_policy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mmlu:virology","suite":["original","mmlu"],"prompt_function":"mmlu_virology","hf_repo":"cais\/mmlu","hf_subset":"virology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:virology","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"virology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:virology","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"virology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:world_religions","suite":["original","mmlu"],"prompt_function":"mmlu_world_religions","hf_repo":"cais\/mmlu","hf_subset":"world_religions","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:world_religions","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"world_religions","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:world_religions","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"world_religions","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mnist_ascii","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"mnist_ascii","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"modified_arithmetic","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"modified_arithmetic","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"moral_permissibility","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"moral_permissibility","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"movie_dialog_same_or_different","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"movie_dialog_same_or_different","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"movie_recommendation","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"movie_recommendation","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mtnt2019:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"mtnt2019_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":200,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mtnt2019:en-ja","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"mtnt2019_en-ja","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":200,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mtnt2019:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"mtnt2019_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":200,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mtnt2019:ja-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"mtnt2019_ja-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":200,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mult_data_wrangling","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"mult_data_wrangling","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"multiemo","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"multiemo","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mutual","suite":["lighteval"],"prompt_function":"mutual","hf_repo":"lighteval\/mutual_harness","hf_subset":"mutual","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["recall_at_1","recall_at_2","mrr"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mutual_plus","suite":["lighteval"],"prompt_function":"mutual","hf_repo":"lighteval\/mutual_harness","hf_subset":"mutual_plus","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["recall_at_1","recall_at_2","mrr"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"narrativeqa","suite":["helm","helm_general"],"prompt_function":"narrativeqa","hf_repo":"lighteval/narrative_qa_helm","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","f1_score","rougeL","bleu_1","bleu_4"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"natural_instructions","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"natural_instructions","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"navigate","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"navigate","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"nonsense_words_grammar","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"nonsense_words_grammar","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"novel_concepts","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"novel_concepts","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"numeracy:linear_example","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"linear_example","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"numeracy:linear_standard","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"linear_standard","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"numeracy:parabola_example","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"parabola_example","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"numeracy:parabola_standard","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"parabola_standard","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"numeracy:paraboloid_example","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"paraboloid_example","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"numeracy:paraboloid_standard","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"paraboloid_standard","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"numeracy:plane_example","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"plane_example","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"numeracy:plane_standard","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"plane_standard","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"object_counting","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"object_counting","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"odd_one_out","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"odd_one_out","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"openbookqa","suite":["helm","commonsense_scenario","helm_general"],"prompt_function":"openbookqa_helm","hf_repo":"openbookqa","hf_subset":"main","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"openbookqa","suite":["lighteval"],"prompt_function":"openbookqa","hf_repo":"openbookqa","hf_subset":"main","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"operators","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"operators","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":"([-+]?\\d+)[.]{0,1}$", "trust_dataset": true} -{"name":"paragraph_segmentation","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"paragraph_segmentation","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"parsinlu_qa","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"parsinlu_qa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"parsinlu_reading_comprehension","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"parsinlu_reading_comprehension","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["perfect_exact_match"],"stop_sequence":null,"output_regex":"[^\\.\\?\\!\\;\\n]+", "trust_dataset": true} 
-{"name":"penguins_in_a_table","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"penguins_in_a_table","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"periodic_elements","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"periodic_elements","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"persian_idioms","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"persian_idioms","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"phrase_relatedness","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"phrase_relatedness","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"physical_intuition","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"physical_intuition","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"physics","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"physics","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"physics_questions","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"physics_questions","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"piqa","suite":["lighteval"],"prompt_function":"piqa_harness","hf_repo":"piqa","hf_subset":"plain_text","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"piqa","suite":["helm","commonsense_scenario"],"prompt_function":"piqa_helm","hf_repo":"piqa","hf_subset":"plain_text","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"play_dialog_same_or_different","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"play_dialog_same_or_different","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"polish_sequence_labeling","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"polish_sequence_labeling","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["f1_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"presuppositions_as_nli","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"presuppositions_as_nli","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"prost","suite":["lighteval"],"prompt_function":"prost","hf_repo":"corypaik\/prost","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"pubmedqa","suite":["lighteval"],"prompt_function":"pubmed_qa","hf_repo":"pubmed_qa","hf_subset":"pqa_labeled","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"pubmedqa","suite":["helm"],"prompt_function":"pubmed_qa_helm","hf_repo":"pubmed_qa","hf_subset":"pqa_labeled","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"qa4mre:2011","suite":["lighteval"],"prompt_function":"qa4mre","hf_repo":"qa4mre","hf_subset":"2011.main.EN","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"qa4mre:2012","suite":["lighteval"],"prompt_function":"qa4mre","hf_repo":"qa4mre","hf_subset":"2012.main.EN","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"qa4mre:2013","suite":["lighteval"],"prompt_function":"qa4mre","hf_repo":"qa4mre","hf_subset":"2013.main.EN","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"qa_wikidata","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"qa_wikidata","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleurt","bleu","rouge_t5","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"qasper","suite":["lighteval"],"prompt_function":"qasper","hf_repo":"qasper","hf_subset":"qasper","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["f1_score_quasi"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"qasper_ll","suite":["lighteval"],"prompt_function":"qasper_ll","hf_repo":"qasper","hf_subset":"qasper","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"quac","suite":["helm"],"prompt_function":"quac","hf_repo":"lighteval/quac_helm","hf_subset":"default","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match", "quasi_exact_match", "f1_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"question_selection","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"question_selection","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"race:high","suite":["lighteval","race"],"prompt_function":"race","hf_repo":"EleutherAI/race","hf_subset":"high","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"raft:ade_corpus_v2","suite":["helm","helm_general"],"prompt_function":"raft_ade_corpus_v2","hf_repo":"ought\/raft","hf_subset":"ade_corpus_v2","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"raft:banking_77","suite":["helm","helm_general"],"prompt_function":"raft_banking_77","hf_repo":"ought\/raft","hf_subset":"banking_77","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"raft:neurips_impact_statement_risks","suite":["helm","helm_general"],"prompt_function":"raft_neurips_impact_statement_risks","hf_repo":"ought\/raft","hf_subset":"neurips_impact_statement_risks","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"raft:one_stop_english","suite":["helm","helm_general"],"prompt_function":"raft_one_stop_english","hf_repo":"ought\/raft","hf_subset":"one_stop_english","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"raft:overruling","suite":["helm","helm_general"],"prompt_function":"raft_overruling","hf_repo":"ought\/raft","hf_subset":"overruling","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"raft:semiconductor_org_types","suite":["helm","helm_general"],"prompt_function":"raft_semiconductor_org_types","hf_repo":"ought\/raft","hf_subset":"semiconductor_org_types","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"raft:systematic_review_inclusion","suite":["helm","helm_general"],"prompt_function":"raft_systematic_review_inclusion","hf_repo":"ought\/raft","hf_subset":"systematic_review_inclusion","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, 
"trust_dataset": true} -{"name":"raft:tai_safety_research","suite":["helm","helm_general"],"prompt_function":"raft_tai_safety_research","hf_repo":"ought\/raft","hf_subset":"tai_safety_research","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"raft:terms_of_service","suite":["helm","helm_general"],"prompt_function":"raft_terms_of_service","hf_repo":"ought\/raft","hf_subset":"terms_of_service","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"raft:tweet_eval_hate","suite":["helm","helm_general"],"prompt_function":"raft_tweet_eval_hate","hf_repo":"ought\/raft","hf_subset":"tweet_eval_hate","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"raft:twitter_complaints","suite":["helm","helm_general"],"prompt_function":"raft_twitter_complaints","hf_repo":"ought\/raft","hf_subset":"twitter_complaints","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"real_or_fake_text","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"real_or_fake_text","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"real_toxicity_prompts","suite":["helm"],"prompt_function":"real_toxicity_prompts","hf_repo":"allenai\/real-toxicity-prompts","hf_subset":"default","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["prediction_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"reasoning_about_colored_objects","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"reasoning_about_colored_objects","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"repeat_copy_logic","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"repeat_copy_logic","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"rephrase","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"rephrase","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["rouge_t5","bleu","loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"rhyming","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"rhyming","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"riddle_sense","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"riddle_sense","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"ruin_names","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"ruin_names","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"salient_translation_error_detection","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"salient_translation_error_detection","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"scientific_press_release","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"scientific_press_release","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"sciq","suite":["lighteval"],"prompt_function":"sciq","hf_repo":"sciq","hf_subset":"default","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"semantic_parsing_in_context_sparc","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"semantic_parsing_in_context_sparc","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"semantic_parsing_spider","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"semantic_parsing_spider","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"sentence_ambiguity","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"sentence_ambiguity","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"similarities_abstraction","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"similarities_abstraction","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"simp_turing_concept","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"simp_turing_concept","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"simple_arithmetic_json","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"simple_arithmetic_json","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"simple_arithmetic_json_multiple_choice","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"simple_arithmetic_json_multiple_choice","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"simple_arithmetic_json_subtasks","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"simple_arithmetic_json_subtasks","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"simple_arithmetic_multiple_targets_json","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"simple_arithmetic_multiple_targets_json","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"simple_ethical_questions","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"simple_ethical_questions","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"simple_text_editing","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"simple_text_editing","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"siqa","suite":["helm","commonsense_scenario"],"prompt_function":"siqa","hf_repo":"social_i_qa","hf_subset":"default","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"snarks","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"snarks","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"social_iqa","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"social_iqa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"social_support","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"social_support","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["f1_score_macro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"sports_understanding","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"sports_understanding","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"storycloze:2016","suite":["lighteval","storycloze"],"prompt_function":"storycloze","hf_repo":"story_cloze","hf_subset":"2016","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"storycloze:2018","suite":["lighteval","storycloze"],"prompt_function":"storycloze","hf_repo":"story_cloze","hf_subset":"2018","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"strange_stories","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"strange_stories","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"strategyqa","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"strategyqa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"sufficient_information","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"sufficient_information","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"suicide_risk","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"suicide_risk","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"summarization:cnn-dm","suite":["helm","helm_general"],"prompt_function":"cnn_dm","hf_repo":"lighteval\/summarization","hf_subset":"cnn-dm","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":128,"metric":["rouge1","rouge2","rougeL","faithfulness","extractiveness","bert_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"summarization:xsum","suite":["helm","helm_general"],"prompt_function":"xsum","hf_repo":"lighteval\/summarization","hf_subset":"xsum","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":64,"metric":["rouge1","rouge2","rougeL","faithfulness","extractiveness","bert_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"summarization:xsum-sampled","suite":["helm"],"prompt_function":"xsum","hf_repo":"lighteval\/summarization","hf_subset":"xsum-sampled","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":64,"metric":["rouge1","rouge2","rougeL","faithfulness","extractiveness","bert_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"super_glue:boolq","suite":["lighteval","superglue"],"prompt_function":"boolq_harness","hf_repo":"super_glue","hf_subset":"boolq","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"super_glue:cb","suite":["lighteval","superglue"],"prompt_function":"cb","hf_repo":"super_glue","hf_subset":"cb","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token", "multi_f1_numeric"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"super_glue:copa","suite":["lighteval","superglue"],"prompt_function":"copa","hf_repo":"super_glue","hf_subset":"copa","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"super_glue:rte","suite":["lighteval","superglue"],"prompt_function":"rte","hf_repo":"super_glue","hf_subset":"rte","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"super_glue:multirc","suite":["lighteval","superglue"],"prompt_function":"multirc","hf_repo":"super_glue","hf_subset":"multirc","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"super_glue:wic","suite":["lighteval","superglue"],"prompt_function":"wic","hf_repo":"super_glue","hf_subset":"wic","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"super_glue:wsc","suite":["lighteval","superglue"],"prompt_function":"wsc","hf_repo":"super_glue","hf_subset":"wsc","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"swahili_english_proverbs","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"swahili_english_proverbs","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"swag","suite":["lighteval"],"prompt_function":"swag","hf_repo":"swag","hf_subset":"regular","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"swedish_to_german_proverbs","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"swedish_to_german_proverbs","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"symbol_interpretation","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"symbol_interpretation","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"synthetic_reasoning:induction","suite":["helm"],"prompt_function":"synthetic_reasoning","hf_repo":"lighteval\/synthetic_reasoning","hf_subset":"induction","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":50,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"synthetic_reasoning:natural_easy","suite":["helm"],"prompt_function":"synthetic_reasoning_natural","hf_repo":"lighteval\/synthetic_reasoning_natural","hf_subset":"easy","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","f1_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"synthetic_reasoning:natural_hard","suite":["helm"],"prompt_function":"synthetic_reasoning_natural","hf_repo":"lighteval\/synthetic_reasoning_natural","hf_subset":"hard","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","f1_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"synthetic_reasoning:pattern_match","suite":["helm"],"prompt_function":"synthetic_reasoning","hf_repo":"lighteval\/synthetic_reasoning","hf_subset":"pattern_match","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":50,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"synthetic_reasoning:variable_substitution","suite":["helm"],"prompt_function":"synthetic_reasoning","hf_repo":"lighteval\/synthetic_reasoning","hf_subset":"variable_substitution","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":50,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"tellmewhy","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"tellmewhy","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"temporal_sequences","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"temporal_sequences","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"tense","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"tense","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:arxiv","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_arxiv","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:arxiv","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"arxiv","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:bibliotik","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"bibliotik","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"the_pile:bookcorpus2","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_bookcorpus2","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:books3","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_books3","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:commoncrawl","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"commoncrawl","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:dm-mathematics","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_dm-mathematics","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:dm-mathematics","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"dm-mathematics","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:enron","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_enron","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:enron","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"enron","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:europarl","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_europarl","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"the_pile:europarl","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"europarl","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:freelaw","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_freelaw","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:freelaw","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"freelaw","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:github","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_github","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:github","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"github","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:gutenberg","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_gutenberg","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:gutenberg","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"gutenberg","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:hackernews","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_hackernews","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"the_pile:hackernews","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"hackernews","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:nih-exporter","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_nih-exporter","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:nih-exporter","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"nih-exporter","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:opensubtitles","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_opensubtitles","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:opensubtitles","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"opensubtitles","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:openwebtext2","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_openwebtext2","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:openwebtext2","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"openwebtext2","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:philpapers","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_philpapers","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"the_pile:pile-cc","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_pile-cc","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:pubmed-abstracts","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_pubmed-abstracts","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:pubmed-abstracts","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"pubmed-abstracts","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:pubmed-central","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_pubmed-central","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:pubmed-central","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"pubmed-central","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:stackexchange","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_stackexchange","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:stackexchange","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"stackexchange","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:ubuntu-irc","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_ubuntu-irc","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"the_pile:uspto","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_upsto","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:upsto","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"uspto","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:wikipedia","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_wikipedia","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:wikipedia","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"wikipedia","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:youtubesubtitles","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_youtubesubtitles","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"the_pile:youtubesubtitles","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"youtubesubtitles","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"timedial","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"timedial","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"toxigen","suite":["lighteval"],"prompt_function":"toxigen","hf_repo":"skg/toxigen-data","hf_subset":"annotated","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"topical_chat","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"topical_chat","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","loglikelihood_acc","bleurt"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"tracking_shuffled_objects","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"tracking_shuffled_objects","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"triviaqa","suite":["lighteval"],"prompt_function":"triviaqa","hf_repo":"trivia_qa","hf_subset":"rc.nocontext","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["quasi_exact_match_triviaqa"],"stop_sequence":["\n", ".", ","],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"truthfulqa:gen","suite":["lighteval"],"prompt_function":"truthful_qa_generative","hf_repo":"truthful_qa","hf_subset":"generation","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":200,"metric":["bleu","rouge_t5"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"truthfulqa:mc","suite":["leaderboard"],"prompt_function":"truthful_qa_multiple_choice","hf_repo":"truthful_qa","hf_subset":"multiple_choice","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["truthfulqa_mc_metrics"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"truthfulqa","suite":["helm","helm_general"],"prompt_function":"truthful_qa_helm","hf_repo":"lighteval\/truthfulqa_helm","hf_subset":"default","hf_avail_splits":["train","valid"],"evaluation_splits":["valid"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["loglikelihood_acc","exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"twitterAAE:aa","suite":["helm"],"prompt_function":"twitter_aae","hf_repo":"lighteval\/twitterAAE","hf_subset":"aa","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"twitterAAE:white","suite":["helm"],"prompt_function":"twitter_aae","hf_repo":"lighteval\/twitterAAE","hf_subset":"white","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"understanding_fables","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"understanding_fables","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"undo_permutation","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"undo_permutation","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"unit_conversion","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"unit_conversion","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"unit_interpretation","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"unit_interpretation","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"unnatural_in_context_learning","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"unnatural_in_context_learning","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"unscramble:anagrams1","suite":["lighteval","unscramble"],"prompt_function":"unscramble","hf_repo":"lighteval\/GPT3_unscramble","hf_subset":"default","hf_avail_splits":["mid_word_1_anagrams"],"evaluation_splits":["mid_word_1_anagrams"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"unscramble:anagrams2","suite":["lighteval","unscramble"],"prompt_function":"unscramble","hf_repo":"lighteval\/GPT3_unscramble","hf_subset":"default","hf_avail_splits":["mid_word_2_anagrams"],"evaluation_splits":["mid_word_2_anagrams"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"unscramble:cycle_letters","suite":["lighteval","unscramble"],"prompt_function":"unscramble","hf_repo":"lighteval\/GPT3_unscramble","hf_subset":"default","hf_avail_splits":["cycle_letters_in_word"],"evaluation_splits":["cycle_letters_in_word"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"unscramble:random_insertion","suite":["lighteval","unscramble"],"prompt_function":"unscramble","hf_repo":"lighteval\/GPT3_unscramble","hf_subset":"default","hf_avail_splits":["random_insertion_in_word"],"evaluation_splits":["random_insertion_in_word"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"unscramble:reversed_words","suite":["lighteval","unscramble"],"prompt_function":"unscramble","hf_repo":"lighteval\/GPT3_unscramble","hf_subset":"default","hf_avail_splits":["reversed_words"],"evaluation_splits":["reversed_words"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"vitaminc_fact_verification","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"vitaminc_fact_verification","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"webqs","suite":["lighteval"],"prompt_function":"webqs","hf_repo":"web_questions","hf_subset":"default","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"what_is_the_tao","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"what_is_the_tao","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"which_wiki_edit","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"which_wiki_edit","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:applies_to_jurisdiction","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"applies_to_jurisdiction","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:atomic_number","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"atomic_number","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wikifact:author","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"author","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:award_received","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"award_received","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:basic_form_of_government","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"basic_form_of_government","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:capital","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"capital","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:capital_of","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"capital_of","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:central_bank","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"central_bank","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:composer","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"composer","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:continent","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"continent","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wikifact:country","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"country","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:country_of_citizenship","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"country_of_citizenship","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:country_of_origin","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"country_of_origin","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:creator","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"creator","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:currency","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"currency","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:defendant","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"defendant","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:developer","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"developer","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:diplomatic_relation","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"diplomatic_relation","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": 
true} -{"name":"wikifact:director","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"director","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:discoverer_or_inventor","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"discoverer_or_inventor","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:drug_or_therapy_used_for_treatment","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"drug_or_therapy_used_for_treatment","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:educated_at","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"educated_at","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:electron_configuration","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"electron_configuration","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:employer","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"employer","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:field_of_work","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"field_of_work","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wikifact:file_extension","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"file_extension","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:genetic_association","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"genetic_association","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:genre","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"genre","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:has_part","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"has_part","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:head_of_government","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"head_of_government","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:head_of_state","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"head_of_state","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:headquarters_location","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"headquarters_location","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:industry","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"industry","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, 
"trust_dataset": true} -{"name":"wikifact:influenced_by","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"influenced_by","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:instance_of","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"instance_of","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:instrument","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"instrument","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:language_of_work_or_name","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"language_of_work_or_name","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:languages_spoken_written_or_signed","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"languages_spoken_written_or_signed","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:laws_applied","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"laws_applied","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:located_in_the_administrative_territorial_entity","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"located_in_the_administrative_territorial_entity","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wikifact:location","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"location","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:location_of_discovery","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"location_of_discovery","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:location_of_formation","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"location_of_formation","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:majority_opinion_by","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"majority_opinion_by","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:manufacturer","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"manufacturer","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:measured_physical_quantity","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"measured_physical_quantity","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:medical_condition_treated","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"medical_condition_treated","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wikifact:member_of","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"member_of","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:member_of_political_party","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"member_of_political_party","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:member_of_sports_team","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"member_of_sports_team","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:movement","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"movement","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:named_after","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"named_after","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:native_language","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"native_language","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:number_of_processor_cores","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"number_of_processor_cores","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wikifact:occupation","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"occupation","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:office_held_by_head_of_government","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"office_held_by_head_of_government","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:office_held_by_head_of_state","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"office_held_by_head_of_state","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:official_language","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"official_language","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:operating_system","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"operating_system","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:original_language_of_film_or_TV_show","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"original_language_of_film_or_TV_show","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:original_network","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"original_network","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wikifact:overrules","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"overrules","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:owned_by","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"owned_by","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:part_of","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"part_of","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:participating_team","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"participating_team","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:place_of_birth","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"place_of_birth","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:place_of_death","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"place_of_death","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:plaintiff","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"plaintiff","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:position_held","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"position_held","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wikifact:position_played_on_team","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"position_played_on_team","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:programming_language","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"programming_language","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:recommended_unit_of_measurement","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"recommended_unit_of_measurement","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:record_label","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"record_label","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:religion","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"religion","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:repealed_by","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"repealed_by","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:shares_border_with","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"shares_border_with","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wikifact:solved_by","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"solved_by","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:statement_describes","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"statement_describes","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:stock_exchange","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"stock_exchange","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:subclass_of","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"subclass_of","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:subsidiary","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"subsidiary","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:symptoms_and_signs","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"symptoms_and_signs","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:therapeutic_area","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"therapeutic_area","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wikifact:time_of_discovery_or_invention","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"time_of_discovery_or_invention","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:twinned_administrative_body","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"twinned_administrative_body","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikifact:work_location","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"work_location","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikitext:2","suite":["lighteval"],"prompt_function":"wikitext","hf_repo":"wikitext","hf_subset":"wikitext-2-raw-v1","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikitext:103:document_level","suite":["harness"],"prompt_function":"wikitext_harness","hf_repo":"EleutherAI\/wikitext_document_level","hf_subset":"wikitext-103-raw-v1","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wikitext:103:document_level","suite":["helm"],"prompt_function":"wikitext_helm","hf_repo":"EleutherAI\/wikitext_document_level","hf_subset":"wikitext-103-raw-v1","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wino_x_german","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"wino_x_german","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"winogrande","suite":["leaderboard"],"prompt_function":"winogrande","hf_repo":"winogrande","hf_subset":"winogrande_xl","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"winowhy","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"winowhy","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt08:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt08:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt08:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt08:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt08:en-es","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_en-es","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt08:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt08:en-hu","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_en-hu","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wmt08:es-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_es-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt08:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt08:hu-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_hu-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt09:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt09:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt09:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt09:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt09:en-es","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_en-es","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wmt09:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt09:en-hu","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_en-hu","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt09:en-it","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_en-it","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt09:es-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_es-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt09:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt09:hu-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_hu-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt09:it-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_it-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt10:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wmt10:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt10:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt10:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt10:en-es","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_en-es","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt10:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt10:es-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_es-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt10:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt11:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wmt11:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt11:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt11:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt11:en-es","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_en-es","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt11:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt11:es-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_es-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt11:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt12:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wmt12:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt12:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt12:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt12:en-es","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_en-es","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt12:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt12:es-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_es-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt12:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt13:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wmt13:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt13:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt13:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt13:en-es","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_en-es","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt13:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt13:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt13:es-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_es-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt13:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wmt13:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt14:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt14:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt14:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt14:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt14:en-fr","suite":["lighteval","gpt3_benchmarks"],"prompt_function":"wmt_alphabetical","hf_repo":"wmt14","hf_subset":"fr-en","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt14:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt14:en-hi","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_en-hi","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wmt14:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt14:fr-en","suite":["lighteval","gpt3_benchmarks"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"wmt14","hf_subset":"fr-en","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt14:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt14:hi-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_hi-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt14:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt14:cs-en","suite":["helm"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/wmt14","hf_subset":"cs-en","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["bleu"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt14:de-en","suite":["helm"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/wmt14","hf_subset":"de-en","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["bleu"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt14:fr-en","suite":["helm"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/wmt14","hf_subset":"fr-en","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["bleu"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wmt14:hi-en","suite":["helm"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/wmt14","hf_subset":"hi-en","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["bleu"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt14:ru-en","suite":["helm"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/wmt14","hf_subset":"ru-en","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["bleu"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt15:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt15:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt15:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt15:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt15:en-fi","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_en-fi","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt15:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wmt15:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt15:fi-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_fi-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt15:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt15:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt16:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt16:de-en","suite":["lighteval","gpt3_benchmarks"],"prompt_function":"wmt_alphabetical","hf_repo":"wmt16","hf_subset":"de-en","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt16:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt16:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wmt16:en-de","suite":["lighteval","gpt3_benchmarks"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"wmt16","hf_subset":"de-en","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt16:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt16:en-fi","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_en-fi","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt16:en-ro","suite":["lighteval","gpt3_benchmarks"],"prompt_function":"wmt_alphabetical","hf_repo":"wmt16","hf_subset":"ro-en","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt16:en-ro","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_en-ro","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt16:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt16:en-tr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_en-tr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt16:fi-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_fi-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wmt16:ro-en","suite":["lighteval","gpt3_benchmarks"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"wmt16","hf_subset":"ro-en","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt16:ro-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_ro-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt16:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt16:tr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_tr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt17:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt17:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt17:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt17:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wmt17:en-fi","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_en-fi","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt17:en-lv","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_en-lv","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt17:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt17:en-tr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_en-tr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt17:en-zh","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_en-zh","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt17:fi-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_fi-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt17:lv-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_lv-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt17:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wmt17:tr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_tr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt17:zh-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_zh-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt18:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt18:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt18:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt18:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt18:en-et","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_en-et","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt18:en-fi","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_en-fi","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wmt18:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt18:en-tr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_en-tr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt18:en-zh","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_en-zh","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt18:et-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_et-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt18:fi-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_fi-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt18:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt18:tr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_tr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt18:zh-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_zh-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wmt19:cs-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_cs-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt19:de-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_de-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt19:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt19:de-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_de-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt19:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt19:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt19:en-fi","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-fi","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt19:en-gu","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-gu","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wmt19:en-kk","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-kk","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt19:en-lt","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-lt","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt19:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt19:en-zh","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-zh","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt19:fi-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_fi-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt19:fr-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_fr-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt19:gu-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_gu-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt19:kk-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_kk-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wmt19:lt-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_lt-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt19:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt19:zh-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_zh-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt20:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt20:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt20:de-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_de-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt20:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt20:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wmt20:en-iu","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-iu","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt20:en-ja","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-ja","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt20:en-km","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-km","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt20:en-pl","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-pl","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt20:en-ps","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-ps","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt20:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt20:en-ta","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-ta","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt20:en-zh","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-zh","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wmt20:fr-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_fr-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt20:iu-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_iu-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt20:ja-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_ja-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt20:km-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_km-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt20:pl-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_pl-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt20:ps-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_ps-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt20:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wmt20:ta-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_ta-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"wmt20:zh-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_zh-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"word_sorting","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"word_sorting","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"word_unscrambling","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"word_unscrambling","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"wsc273","suite":["lighteval"],"prompt_function":"wsc273","hf_repo":"winograd_wsc","hf_subset":"wsc273","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"xcopa:en","suite":["lighteval"],"prompt_function":"xcopa_en","hf_repo":"xcopa","hf_subset":"default","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"xcopa:et","suite":["lighteval"],"prompt_function":"xcopa_et","hf_repo":"xcopa","hf_subset":"et","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"xcopa:ht","suite":["lighteval"],"prompt_function":"xcopa_ht","hf_repo":"xcopa","hf_subset":"ht","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"xcopa:it","suite":["lighteval"],"prompt_function":"xcopa_it","hf_repo":"xcopa","hf_subset":"it","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"xcopa:id","suite":["lighteval"],"prompt_function":"xcopa_id","hf_repo":"xcopa","hf_subset":"id","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"xcopa:qu","suite":["lighteval"],"prompt_function":"xcopa_qu","hf_repo":"xcopa","hf_subset":"qu","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"xcopa:sw","suite":["lighteval"],"prompt_function":"xcopa_sw","hf_repo":"xcopa","hf_subset":"sw","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"xcopa:zh","suite":["lighteval"],"prompt_function":"xcopa_zh","hf_repo":"xcopa","hf_subset":"zh","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"xcopa:ta","suite":["lighteval"],"prompt_function":"xcopa_ta","hf_repo":"xcopa","hf_subset":"ta","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"xcopa:th","suite":["lighteval"],"prompt_function":"xcopa_th","hf_repo":"xcopa","hf_subset":"th","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"xcopa:tr","suite":["lighteval"],"prompt_function":"xcopa_tr","hf_repo":"xcopa","hf_subset":"tr","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"xcopa:vi","suite":["lighteval"],"prompt_function":"xcopa_vi","hf_repo":"xcopa","hf_subset":"vi","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"xstory_cloze:en","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"en","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"xstory_cloze:ru","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"ru","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"xstory_cloze:zh","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"zh","hf_avail_splits":["training", 
"eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"xstory_cloze:es","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"es","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"xstory_cloze:ar","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"ar","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"xstory_cloze:hi","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"hi","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"xstory_cloze:id","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"id","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"xstory_cloze:te","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"te","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"xstory_cloze:sw","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"sw","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"xstory_cloze:eu","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"eu","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"xstory_cloze:my","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"my","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"xwinograd:en","suite":["lighteval"],"prompt_function":"winogrande","hf_repo":"Muennighoff/xwinograd","hf_subset":"en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"xwinograd:fr","suite":["lighteval"],"prompt_function":"winogrande","hf_repo":"Muennighoff/xwinograd","hf_subset":"fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"xwinograd:jp","suite":["lighteval"],"prompt_function":"winogrande","hf_repo":"Muennighoff/xwinograd","hf_subset":"jp","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"xwinograd:pt","suite":["lighteval"],"prompt_function":"winogrande","hf_repo":"Muennighoff/xwinograd","hf_subset":"pt","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"xwinograd:ru","suite":["lighteval"],"prompt_function":"winogrande","hf_repo":"Muennighoff/xwinograd","hf_subset":"ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"xwinograd:zh","suite":["lighteval"],"prompt_function":"winogrande","hf_repo":"Muennighoff/xwinograd","hf_subset":"zh","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"abstract_narrative_understanding","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"abstract_narrative_understanding","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"agieval:aqua-rat","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-aqua-rat","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"agieval:gaokao-biology","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-biology","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"agieval:gaokao-chemistry","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-chemistry","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"agieval:gaokao-chinese","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-chinese","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"agieval:gaokao-english","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-english","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"agieval:gaokao-geography","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-geography","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"agieval:gaokao-history","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-history","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"agieval:gaokao-mathqa","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-mathqa","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"agieval:gaokao-physics","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-physics","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"agieval:logiqa-en","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-logiqa-en","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"agieval:logiqa-zh","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-logiqa-zh","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"agieval:lsat-ar","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-lsat-ar","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"agieval:lsat-lr","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-lsat-lr","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"agieval:lsat-rc","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-lsat-rc","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"agieval:sat-en","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-sat-en","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"agieval:sat-en-without-passage","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-sat-en-without-passage","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"agieval:sat-math","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-sat-math","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"anachronisms","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"anachronisms","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"analogical_similarity","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"analogical_similarity","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"analytic_entailment","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"analytic_entailment","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"anli","suite":["lighteval","anli"],"prompt_function":"anli","hf_repo":"anli","hf_subset":"plain_text","hf_avail_splits":["train_r1","dev_r1","train_r2","dev_r2","train_r3","dev_r3","test_r1","test_r2","test_r3"],"evaluation_splits":["test_r1","test_r2","test_r3"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"anli:r1","suite":["lighteval","anli"],"prompt_function":"anli","hf_repo":"anli","hf_subset":"plain_text","hf_avail_splits":["train_r1","dev_r1","test_r1"],"evaluation_splits":["test_r1"],"few_shots_split":"train_r1","few_shots_select":"random_sampling_from_train","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"anli:r2","suite":["lighteval","anli"],"prompt_function":"anli","hf_repo":"anli","hf_subset":"plain_text","hf_avail_splits":["train_r2","dev_r2","test_r2"],"evaluation_splits":["test_r2"],"few_shots_split":"train_r2","few_shots_select":"random_sampling_from_train","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"anli:r3","suite":["lighteval","anli"],"prompt_function":"anli","hf_repo":"anli","hf_subset":"plain_text","hf_avail_splits":["train_r3","dev_r3","test_r3"],"evaluation_splits":["test_r3"],"few_shots_split":"train_r3","few_shots_select":"random_sampling_from_train","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"arc:c:letters","suite":["original","arc"],"prompt_function":"arc_with_options_letters_predict","hf_repo":"ai2_arc","hf_subset":"ARC-Challenge","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"arc:c:options","suite":["original","arc"],"prompt_function":"arc_with_options","hf_repo":"ai2_arc","hf_subset":"ARC-Challenge","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"arc:c:simple","suite":["original","arc"],"prompt_function":"arc","hf_repo":"ai2_arc","hf_subset":"ARC-Challenge","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"arc:challenge","suite":["leaderboard","arc"],"prompt_function":"arc","hf_repo":"ai2_arc","hf_subset":"ARC-Challenge","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":1,"metric":["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"arc:easy","suite":["lighteval","arc"],"prompt_function":"arc","hf_repo":"ai2_arc","hf_subset":"ARC-Easy","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"arithmetic:1dc","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_1dc","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"arithmetic:2da","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_2da","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"arithmetic:2dm","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_2dm","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"arithmetic:2ds","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_2ds","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"arithmetic:3da","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_3da","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"arithmetic:3ds","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_3ds","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"arithmetic:4da","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_4da","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"arithmetic:4ds","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_4ds","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"arithmetic:5da","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_5da","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"arithmetic:5ds","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_5ds","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"arithmetic_bb","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"arithmetic","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"ascii_word_recognition","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"ascii_word_recognition","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"asdiv","suite":["lighteval"],"prompt_function":"asdiv","hf_repo":"EleutherAI\/asdiv","hf_subset":"asdiv","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"authorship_verification","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"authorship_verification","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"auto_categorization","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"auto_categorization","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"auto_debugging","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_and_after_query","hf_repo":"bigbench","hf_subset":"auto_debugging","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["perfect_exact_match"],"stop_sequence":null,"output_regex":"[^\\.\\?\\!\\;\\n]+", "trust_dataset": true,"version":0} +{"name":"babi_qa","suite":["helm"],"prompt_function":"babi_qa","hf_repo":"facebook\/babi_qa","hf_subset":"en-valid-qa1","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:causal_judgment","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"causal_judgement","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bigbench:date_understanding","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"date_understanding","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bigbench:disambiguation_qa","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"disambiguation_qa","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bigbench:geometric_shapes","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"geometric_shapes","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} 
+{"name":"bigbench:logical_deduction_five_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_five_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bigbench:logical_deduction_seven_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_seven_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bigbench:logical_deduction_three_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_three_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bigbench:movie_recommendation","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"movie_recommendation","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bigbench:navigate","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"navigate","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bigbench:reasoning_about_colored_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"reasoning_about_colored_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bigbench:ruin_names","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"ruin_names","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bigbench:salient_translation_error_detection","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"salient_translation_error_detection","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} 
+{"name":"bigbench:snarks","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"snarks","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bigbench:sports_understanding","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"sports_understanding","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bigbench:temporal_sequences","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"temporal_sequences","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bigbench:tracking_shuffled_objects_five_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_five_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bigbench:tracking_shuffled_objects_seven_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_seven_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bigbench:tracking_shuffled_objects_three_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_three_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bigbench:causal_judgment","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"causal_judgement","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} +{"name":"bigbench:date_understanding","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"date_understanding","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, 
"must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} +{"name":"bigbench:disambiguation_qa","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"disambiguation_qa","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} +{"name":"bigbench:geometric_shapes","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"geometric_shapes","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} +{"name":"bigbench:logical_deduction_five_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_five_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} +{"name":"bigbench:logical_deduction_seven_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_seven_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} +{"name":"bigbench:logical_deduction_three_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_three_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} +{"name":"bigbench:movie_recommendation","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"movie_recommendation","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} +{"name":"bigbench:navigate","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"navigate","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} 
+{"name":"bigbench:reasoning_about_colored_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"reasoning_about_colored_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} +{"name":"bigbench:ruin_names","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"ruin_names","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} +{"name":"bigbench:salient_translation_error_detection","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"salient_translation_error_detection","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} +{"name":"bigbench:snarks","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"snarks","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} +{"name":"bigbench:sports_understanding","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"sports_understanding","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} +{"name":"bigbench:temporal_sequences","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"temporal_sequences","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} +{"name":"bigbench:tracking_shuffled_objects_five_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_five_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} 
+{"name":"bigbench:tracking_shuffled_objects_seven_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_seven_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} +{"name":"bigbench:tracking_shuffled_objects_three_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_three_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} +{"name":"bbh:boolean_expressions","suite":["harness"],"prompt_function":"bbh_boolean_expressions","hf_repo":"lukaemon/bbh","hf_subset":"boolean_expressions","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bbh:causal_judgment","suite":["harness"],"prompt_function":"bbh_causal_judgment","hf_repo":"lukaemon/bbh","hf_subset":"causal_judgement","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bbh:date_understanding","suite":["harness"],"prompt_function":"bbh_date_understanding","hf_repo":"lukaemon/bbh","hf_subset":"date_understanding","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bbh:disambiguation_qa","suite":["harness"],"prompt_function":"bbh_disambiguation_qa","hf_repo":"lukaemon/bbh","hf_subset":"disambiguation_qa","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bbh:dyck_languages","suite":["harness"],"prompt_function":"bbh_dyck_languages","hf_repo":"lukaemon/bbh","hf_subset":"dyck_languages","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} 
+{"name":"bbh:formal_fallacies","suite":["harness"],"prompt_function":"bbh_formal_fallacies","hf_repo":"lukaemon/bbh","hf_subset":"formal_fallacies","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bbh:geometric_shapes","suite":["harness"],"prompt_function":"bbh_geometric_shapes","hf_repo":"lukaemon/bbh","hf_subset":"geometric_shapes","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bbh:hyperbaton","suite":["harness"],"prompt_function":"bbh_hyperbaton","hf_repo":"lukaemon/bbh","hf_subset":"hyperbaton","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bbh:logical_deduction_five_objects","suite":["harness"],"prompt_function":"bbh_logical_deduction_five_objects","hf_repo":"lukaemon/bbh","hf_subset":"logical_deduction_five_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bbh:logical_deduction_seven_objects","suite":["harness"],"prompt_function":"bbh_logical_deduction_seven_objects","hf_repo":"lukaemon/bbh","hf_subset":"logical_deduction_seven_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bbh:logical_deduction_three_objects","suite":["harness"],"prompt_function":"bbh_logical_deduction_three_objects","hf_repo":"lukaemon/bbh","hf_subset":"logical_deduction_three_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bbh:movie_recommendation","suite":["harness"],"prompt_function":"bbh_movie_recommendation","hf_repo":"lukaemon/bbh","hf_subset":"movie_recommendation","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, 
"trust_dataset":true,"version":0} +{"name":"bbh:multistep_arithmetic_two","suite":["harness"],"prompt_function":"bbh_multistep_arithmetic_two","hf_repo":"lukaemon/bbh","hf_subset":"multistep_arithmetic_two","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bbh:navigate","suite":["harness"],"prompt_function":"bbh_navigate","hf_repo":"lukaemon/bbh","hf_subset":"navigate","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bbh:object_counting","suite":["harness"],"prompt_function":"bbh_object_counting","hf_repo":"lukaemon/bbh","hf_subset":"object_counting","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bbh:penguins_in_a_table","suite":["harness"],"prompt_function":"bbh_penguins_in_a_table","hf_repo":"lukaemon/bbh","hf_subset":"penguins_in_a_table","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bbh:reasoning_about_colored_objects","suite":["harness"],"prompt_function":"bbh_reasoning_about_colored_objects","hf_repo":"lukaemon/bbh","hf_subset":"reasoning_about_colored_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bbh:ruin_names","suite":["harness"],"prompt_function":"bbh_ruin_names","hf_repo":"lukaemon/bbh","hf_subset":"ruin_names","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bbh:salient_translation_error_detection","suite":["harness"],"prompt_function":"bbh_salient_translation_error_detection","hf_repo":"lukaemon/bbh","hf_subset":"salient_translation_error_detection","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, 
"trust_dataset":true,"version":0} +{"name":"bbh:snarks","suite":["harness"],"prompt_function":"bbh_snarks","hf_repo":"lukaemon/bbh","hf_subset":"snarks","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bbh:sports_understanding","suite":["harness"],"prompt_function":"bbh_sports_understanding","hf_repo":"lukaemon/bbh","hf_subset":"sports_understanding","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bbh:temporal_sequences","suite":["harness"],"prompt_function":"bbh_temporal_sequences","hf_repo":"lukaemon/bbh","hf_subset":"temporal_sequences","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bbh:tracking_shuffled_objects_five_objects","suite":["harness"],"prompt_function":"bbh_tracking_shuffled_objects_five_objects","hf_repo":"lukaemon/bbh","hf_subset":"tracking_shuffled_objects_five_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bbh:tracking_shuffled_objects_seven_objects","suite":["harness"],"prompt_function":"bbh_tracking_shuffled_objects_seven_objects","hf_repo":"lukaemon/bbh","hf_subset":"tracking_shuffled_objects_seven_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bbh:tracking_shuffled_objects_three_objects","suite":["harness"],"prompt_function":"bbh_tracking_shuffled_objects_three_objects","hf_repo":"lukaemon/bbh","hf_subset":"tracking_shuffled_objects_three_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} 
+{"name":"bbh:web_of_lies","suite":["harness"],"prompt_function":"bbh_web_of_lies","hf_repo":"lukaemon/bbh","hf_subset":"web_of_lies","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bbh:word_sorting","suite":["harness"],"prompt_function":"bbh_word_sorting","hf_repo":"lukaemon/bbh","hf_subset":"word_sorting","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} +{"name":"bbq","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"all","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bbq:Age","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Age","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bbq:Disability_status","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Disability_status","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bbq:Gender_identity","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Gender_identity","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bbq:Nationality","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Nationality","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"bbq:Physical_appearance","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Physical_appearance","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bbq:Race_ethnicity","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Race_ethnicity","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bbq:Race_x_SES","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Race_x_SES","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bbq:Race_x_gender","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Race_x_gender","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bbq:Religion","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Religion","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bbq:SES","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"SES","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bbq:Sexual_orientation","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Sexual_orientation","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"bbq_lite_json","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"bbq_lite_json","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:auto_debugging","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"auto_debugging","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:bbq_lite_json:age_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-age_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:bbq_lite_json:age_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-age_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:bbq_lite_json:disability_status_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-disability_status_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:bbq_lite_json:disability_status_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-disability_status_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"bigbench:bbq_lite_json:gender_identity_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-gender_identity_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:bbq_lite_json:gender_identity_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-gender_identity_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:bbq_lite_json:nationality_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-nationality_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:bbq_lite_json:nationality_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-nationality_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:bbq_lite_json:physical_appearance_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-physical_appearance_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:bbq_lite_json:physical_appearance_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-physical_appearance_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"bigbench:bbq_lite_json:race_ethnicity_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-race_ethnicity_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:bbq_lite_json:race_ethnicity_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-race_ethnicity_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:bbq_lite_json:religion_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-religion_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:bbq_lite_json:religion_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-religion_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:bbq_lite_json:ses_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-ses_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:bbq_lite_json:ses_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-ses_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"bigbench:bbq_lite_json:sexual_orientation_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-sexual_orientation_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:bbq_lite_json:sexual_orientation_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-sexual_orientation_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:code_line_description","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"code_line_description","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:conceptual_combinations:contradictions","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conceptual_combinations-contradictions","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:conceptual_combinations:emergent_properties","suite":["helm"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conceptual_combinations-emergent_properties","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:conceptual_combinations:fanciful_fictional_combinations","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conceptual_combinations-fanciful_fictional_combinations","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"bigbench:conceptual_combinations:homonyms","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conceptual_combinations-homonyms","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:conceptual_combinations:invented_words","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conceptual_combinations-invented_words","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:conlang_translation:adna_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-adna_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:conlang_translation:adna_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-adna_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:conlang_translation:atikampe_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-atikampe_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:conlang_translation:atikampe_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-atikampe_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:conlang_translation:gornam_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-gornam_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"bigbench:conlang_translation:gornam_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-gornam_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:conlang_translation:holuan_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-holuan_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:conlang_translation:holuan_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-holuan_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:conlang_translation:mkafala_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-mkafala_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:conlang_translation:mkafala_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-mkafala_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:conlang_translation:postpositive_english_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-postpositive_english_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:conlang_translation:postpositive_english_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-postpositive_english_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"bigbench:conlang_translation:unapuri_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-unapuri_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:conlang_translation:unapuri_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-unapuri_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:conlang_translation:vaomi_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-vaomi_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:conlang_translation:vaomi_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-vaomi_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:emoji_movie","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"emoji_movie","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:formal_fallacies_syllogisms_negation","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"formal_fallacies_syllogisms_negation","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:hindu_knowledge","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"hindu_knowledge","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"bigbench:known_unknowns","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"known_unknowns","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:language_identification","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"language_identification","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:linguistics_puzzles","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"linguistics_puzzles","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:logic_grid_puzzle","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"logic_grid_puzzle","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:logical_deduction-five_objects","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"logical_deduction-five_objects","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:logical_deduction-seven_objects","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"logical_deduction-seven_objects","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"bigbench:logical_deduction-three_objects","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"logical_deduction-three_objects","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:misconceptions_russian","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"misconceptions_russian","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:novel_concepts","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"novel_concepts","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:operators","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"operators","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:parsinlu_reading_comprehension","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"parsinlu_reading_comprehension","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:play_dialog_same_or_different","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"play_dialog_same_or_different","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:repeat_copy_logic","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"repeat_copy_logic","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"bigbench:strange_stories-boolean","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"strange_stories-boolean","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:strange_stories-multiple_choice","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"strange_stories-multiple_choice","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:strategyqa","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"strategyqa","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:symbol_interpretation-adversarial","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"symbol_interpretation-adversarial","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:symbol_interpretation-emoji_agnostic","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"symbol_interpretation-emoji_agnostic","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:symbol_interpretation-name_agnostic","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"symbol_interpretation-name_agnostic","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"bigbench:symbol_interpretation-plain","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"symbol_interpretation-plain","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:symbol_interpretation-tricky","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"symbol_interpretation-tricky","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:vitaminc_fact_verification","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"vitaminc_fact_verification","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bigbench:winowhy","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"winowhy","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:adjunct_island","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"adjunct_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:adjunct_island","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"adjunct_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:anaphor_gender_agreement","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"anaphor_gender_agreement","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"blimp:anaphor_gender_agreement","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"anaphor_gender_agreement","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:anaphor_number_agreement","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"anaphor_number_agreement","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:anaphor_number_agreement","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"anaphor_number_agreement","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:animate_subject_passive","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"animate_subject_passive","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:animate_subject_passive","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"animate_subject_passive","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:animate_subject_trans","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"animate_subject_trans","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:animate_subject_trans","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"animate_subject_trans","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:causative","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"causative","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"blimp:causative","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"causative","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:complex_NP_island","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"complex_NP_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:complex_NP_island","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"complex_NP_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:coordinate_structure_constraint_complex_left_branch","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"coordinate_structure_constraint_complex_left_branch","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:coordinate_structure_constraint_complex_left_branch","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"coordinate_structure_constraint_complex_left_branch","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:coordinate_structure_constraint_object_extraction","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"coordinate_structure_constraint_object_extraction","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:coordinate_structure_constraint_object_extraction","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"coordinate_structure_constraint_object_extraction","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:determiner_noun_agreement_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"blimp:determiner_noun_agreement_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:determiner_noun_agreement_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:determiner_noun_agreement_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:determiner_noun_agreement_irregular_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_irregular_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:determiner_noun_agreement_irregular_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_irregular_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:determiner_noun_agreement_irregular_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_irregular_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:determiner_noun_agreement_irregular_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_irregular_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:determiner_noun_agreement_with_adj_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adj_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"blimp:determiner_noun_agreement_with_adj_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adj_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:determiner_noun_agreement_with_adj_irregular_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adj_irregular_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:determiner_noun_agreement_with_adj_irregular_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adj_irregular_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:determiner_noun_agreement_with_adj_irregular_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adj_irregular_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:determiner_noun_agreement_with_adj_irregular_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adj_irregular_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:determiner_noun_agreement_with_adjective_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adjective_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:determiner_noun_agreement_with_adjective_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adjective_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"blimp:distractor_agreement_relational_noun","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"distractor_agreement_relational_noun","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:distractor_agreement_relational_noun","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"distractor_agreement_relational_noun","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:distractor_agreement_relative_clause","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"distractor_agreement_relative_clause","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:distractor_agreement_relative_clause","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"distractor_agreement_relative_clause","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:drop_argument","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"drop_argument","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:drop_argument","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"drop_argument","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:ellipsis_n_bar_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"ellipsis_n_bar_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:ellipsis_n_bar_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"ellipsis_n_bar_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"blimp:ellipsis_n_bar_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"ellipsis_n_bar_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:ellipsis_n_bar_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"ellipsis_n_bar_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:existential_there_object_raising","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"existential_there_object_raising","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:existential_there_object_raising","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"existential_there_object_raising","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:existential_there_quantifiers_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"existential_there_quantifiers_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:existential_there_quantifiers_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"existential_there_quantifiers_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:existential_there_quantifiers_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"existential_there_quantifiers_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:existential_there_quantifiers_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"existential_there_quantifiers_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"blimp:existential_there_subject_raising","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"existential_there_subject_raising","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:existential_there_subject_raising","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"existential_there_subject_raising","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:expletive_it_object_raising","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"expletive_it_object_raising","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:expletive_it_object_raising","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"expletive_it_object_raising","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:inchoative","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"inchoative","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:inchoative","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"inchoative","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:intransitive","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"intransitive","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:intransitive","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"intransitive","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"blimp:irregular_past_participle_adjectives","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"irregular_past_participle_adjectives","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:irregular_past_participle_adjectives","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"irregular_past_participle_adjectives","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:irregular_past_participle_verbs","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"irregular_past_participle_verbs","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:irregular_past_participle_verbs","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"irregular_past_participle_verbs","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:irregular_plural_subject_verb_agreement_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"irregular_plural_subject_verb_agreement_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:irregular_plural_subject_verb_agreement_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"irregular_plural_subject_verb_agreement_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:irregular_plural_subject_verb_agreement_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"irregular_plural_subject_verb_agreement_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:irregular_plural_subject_verb_agreement_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"irregular_plural_subject_verb_agreement_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"blimp:left_branch_island_echo_question","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"left_branch_island_echo_question","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:left_branch_island_echo_question","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"left_branch_island_echo_question","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:left_branch_island_simple_question","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"left_branch_island_simple_question","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:left_branch_island_simple_question","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"left_branch_island_simple_question","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:matrix_question_npi_licensor_present","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"matrix_question_npi_licensor_present","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:matrix_question_npi_licensor_present","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"matrix_question_npi_licensor_present","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:npi_present_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"npi_present_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:npi_present_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"npi_present_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"blimp:npi_present_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"npi_present_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:npi_present_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"npi_present_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:only_npi_licensor_present","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"only_npi_licensor_present","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:only_npi_licensor_present","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"only_npi_licensor_present","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:only_npi_scope","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"only_npi_scope","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:only_npi_scope","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"only_npi_scope","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:passive_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"passive_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:passive_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"passive_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:passive_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"passive_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, 
"trust_dataset": true,"version":0} +{"name":"blimp:passive_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"passive_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:principle_A_c_command","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"principle_A_c_command","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:principle_A_c_command","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"principle_A_c_command","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:principle_A_case_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"principle_A_case_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:principle_A_case_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"principle_A_case_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:principle_A_case_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"principle_A_case_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:principle_A_case_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"principle_A_case_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:principle_A_domain_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"principle_A_domain_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"blimp:principle_A_domain_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"principle_A_domain_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:principle_A_domain_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"principle_A_domain_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:principle_A_domain_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"principle_A_domain_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:principle_A_domain_3","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"principle_A_domain_3","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:principle_A_domain_3","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"principle_A_domain_3","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:principle_A_reconstruction","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"principle_A_reconstruction","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:principle_A_reconstruction","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"principle_A_reconstruction","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:regular_plural_subject_verb_agreement_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"regular_plural_subject_verb_agreement_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"blimp:regular_plural_subject_verb_agreement_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"regular_plural_subject_verb_agreement_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:regular_plural_subject_verb_agreement_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"regular_plural_subject_verb_agreement_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:regular_plural_subject_verb_agreement_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"regular_plural_subject_verb_agreement_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:sentential_negation_npi_licensor_present","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"sentential_negation_npi_licensor_present","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:sentential_negation_npi_licensor_present","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"sentential_negation_npi_licensor_present","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:sentential_negation_npi_scope","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"sentential_negation_npi_scope","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:sentential_negation_npi_scope","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"sentential_negation_npi_scope","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:sentential_subject_island","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"sentential_subject_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"blimp:sentential_subject_island","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"sentential_subject_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:superlative_quantifiers_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"superlative_quantifiers_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:superlative_quantifiers_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"superlative_quantifiers_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:superlative_quantifiers_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"superlative_quantifiers_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:superlative_quantifiers_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"superlative_quantifiers_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:tough_vs_raising_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"tough_vs_raising_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:tough_vs_raising_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"tough_vs_raising_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:tough_vs_raising_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"tough_vs_raising_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"blimp:tough_vs_raising_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"tough_vs_raising_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:transitive","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"transitive","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:transitive","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"transitive","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:wh_island","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:wh_island","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:wh_questions_object_gap","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_questions_object_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:wh_questions_object_gap","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_questions_object_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:wh_questions_subject_gap","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_questions_subject_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"blimp:wh_questions_subject_gap","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_questions_subject_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:wh_questions_subject_gap_long_distance","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_questions_subject_gap_long_distance","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:wh_questions_subject_gap_long_distance","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_questions_subject_gap_long_distance","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:wh_vs_that_no_gap","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_vs_that_no_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:wh_vs_that_no_gap","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_vs_that_no_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:wh_vs_that_no_gap_long_distance","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_vs_that_no_gap_long_distance","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:wh_vs_that_no_gap_long_distance","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_vs_that_no_gap_long_distance","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:wh_vs_that_with_gap","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_vs_that_with_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"blimp:wh_vs_that_with_gap","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_vs_that_with_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:wh_vs_that_with_gap_long_distance","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_vs_that_with_gap_long_distance","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"blimp:wh_vs_that_with_gap_long_distance","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_vs_that_with_gap_long_distance","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bold","suite":["helm"],"prompt_function":"bold","hf_repo":"lighteval\/bold_helm","hf_subset":"all","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["prediction_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bold:gender","suite":["helm"],"prompt_function":"bold","hf_repo":"lighteval\/bold_helm","hf_subset":"gender","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["prediction_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bold:political_ideology","suite":["helm"],"prompt_function":"bold","hf_repo":"lighteval\/bold_helm","hf_subset":"political_ideology","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["prediction_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bold:profession","suite":["helm"],"prompt_function":"bold","hf_repo":"lighteval\/bold_helm","hf_subset":"profession","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["prediction_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bold:race","suite":["helm"],"prompt_function":"bold","hf_repo":"lighteval\/bold_helm","hf_subset":"race","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["prediction_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"bold:religious_ideology","suite":["helm"],"prompt_function":"bold","hf_repo":"lighteval\/bold_helm","hf_subset":"religious_ideology","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["prediction_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"boolq","suite":["helm","helm_general"],"prompt_function":"boolq_helm","hf_repo":"lighteval\/boolq_helm","hf_subset":"default","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"boolq:contrastset","suite":["helm"],"prompt_function":"boolq_helm_contrastset","hf_repo":"lighteval\/boolq_helm","hf_subset":"default","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"bridging_anaphora_resolution_barqa","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"bridging_anaphora_resolution_barqa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"causal_judgment","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"causal_judgment","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"cause_and_effect","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"cause_and_effect","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"checkmate_in_one","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"checkmate_in_one","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"chess_state_tracking","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"chess_state_tracking","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"chinese_remainder_theorem","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"chinese_remainder_theorem","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"cifar10_classification","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"cifar10_classification","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"civil_comments","suite":["helm","helm_general"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"all","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"civil_comments:LGBTQ","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"LGBTQ","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"civil_comments:black","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"black","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"civil_comments:christian","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"christian","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"civil_comments:female","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"female","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"civil_comments:male","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"male","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"civil_comments:muslim","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"muslim","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"civil_comments:other_religions","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"other_religions","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"civil_comments:white","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"white","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"code_line_description","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_and_after_query","hf_repo":"bigbench","hf_subset":"code_line_description","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"codenames","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"codenames","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["rouge_t5","bleu","bleu"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"color","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"color","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["rouge_t5","bleu","loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"common_morpheme","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"common_morpheme","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"commonsenseqa","suite":["helm","commonsense_scenario"],"prompt_function":"commonsense_qa","hf_repo":"commonsense_qa","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"conceptual_combinations","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"conceptual_combinations","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"conlang_translation","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"conlang_translation","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge_t5","bleu","perfect_exact_match"],"stop_sequence":[".",";","!","?"],"output_regex":"[^\\.\\?\\!\\;\\n]+", "trust_dataset": true,"version":0} +{"name":"contextual_parametric_knowledge_conflicts","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"contextual_parametric_knowledge_conflicts","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["rouge_t5","loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"copyright:n_books_1000-extractions_per_book_1-prefix_length_125","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"n_books_1000-extractions_per_book_1-prefix_length_125","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"copyright:n_books_1000-extractions_per_book_1-prefix_length_25","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"n_books_1000-extractions_per_book_1-prefix_length_25","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"copyright:n_books_1000-extractions_per_book_1-prefix_length_5","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"n_books_1000-extractions_per_book_1-prefix_length_5","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"copyright:n_books_1000-extractions_per_book_3-prefix_length_125","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"n_books_1000-extractions_per_book_3-prefix_length_125","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"copyright:n_books_1000-extractions_per_book_3-prefix_length_25","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"n_books_1000-extractions_per_book_3-prefix_length_25","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"copyright:n_books_1000-extractions_per_book_3-prefix_length_5","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"n_books_1000-extractions_per_book_3-prefix_length_5","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"copyright:oh_the_places","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"oh_the_places","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"copyright:pilot","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"pilot","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"copyright:popular_books-prefix_length_10","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"popular_books-prefix_length_10","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"copyright:popular_books-prefix_length_125","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"popular_books-prefix_length_125","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"copyright:popular_books-prefix_length_25","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"popular_books-prefix_length_25","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"copyright:popular_books-prefix_length_250","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"popular_books-prefix_length_250","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"copyright:popular_books-prefix_length_5","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"popular_books-prefix_length_5","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"copyright:popular_books-prefix_length_50","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"popular_books-prefix_length_50","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"copyright:prompt_num_line_1-min_lines_20","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"prompt_num_line_1-min_lines_20","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"copyright:prompt_num_line_10-min_lines_20","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"prompt_num_line_10-min_lines_20","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"copyright:prompt_num_line_5-min_lines_20","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"prompt_num_line_5-min_lines_20","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, 
"trust_dataset": true,"version":0} +{"name":"coqa","suite":["lighteval"],"prompt_function":"coqa","hf_repo":"coqa","hf_subset":"default","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["perfect_exact_match","f1_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"coqa_bb","suite":["lighteval","bigbench_programmatic","bigbench"],"prompt_function":"coqa","hf_repo":"coqa","hf_subset":"default","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["perfect_exact_match","f1_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"covid_dialogue","suite":["helm"],"prompt_function":"covid_dialogue","hf_repo":"lighteval\/covid_dialogue","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":128,"metric":["exact_match","quasi_exact_match","f1_score","rougeL","bleu_1","bleu_4"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"crash_blossom","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"crash_blossom","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"crass_ai","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"crass_ai","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"cryobiology_spanish","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"cryobiology_spanish","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"cryptonite","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"cryptonite","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"cs_algorithms","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"cs_algorithms","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"dark_humor_detection","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"dark_humor_detection","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"date_understanding","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"date_understanding","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"disambiguation_qa","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"disambiguation_qa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"discourse_marker_prediction","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"discourse_marker_prediction","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"disfl_qa","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"disfl_qa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"drop","suite":["lighteval"],"prompt_function":"drop","hf_repo":"lighteval/drop_harness","hf_subset":"default","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":"train","few_shots_select":"random_sampling_from_train","generation_size":null,"metric":["drop"],"stop_sequence":["."],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"dyck_language:2","suite":["helm"],"prompt_function":"dyck_language","hf_repo":"lighteval\/DyckLanguage","hf_subset":"2","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"dyck_language:3","suite":["helm"],"prompt_function":"dyck_language","hf_repo":"lighteval\/DyckLanguage","hf_subset":"3","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"dyck_language:4","suite":["helm"],"prompt_function":"dyck_language","hf_repo":"lighteval\/DyckLanguage","hf_subset":"4","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"dyck_languages","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"dyck_languages","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"elementary_math_qa","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"elementary_math_qa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"emoji_movie","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"emoji_movie","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["rouge_t5","bleu","loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"emojis_emotion_prediction","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"emojis_emotion_prediction","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"empirical_judgments","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"empirical_judgments","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"english_proverbs","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"english_proverbs","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"english_russian_proverbs","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"english_russian_proverbs","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"entailed_polarity","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"entailed_polarity","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"entailed_polarity_hindi","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"entailed_polarity_hindi","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"entity_data_imputation:Buy","suite":["helm"],"prompt_function":"entity_data_imputation","hf_repo":"lighteval\/Buy","hf_subset":"default","hf_avail_splits":["train","test","valid"],"evaluation_splits":["valid","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"entity_data_imputation:Restaurant","suite":["helm"],"prompt_function":"entity_data_imputation","hf_repo":"lighteval\/Restaurant","hf_subset":"default","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"entity_matching:Abt_Buy","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Abt_Buy","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"entity_matching:Amazon_Google","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Amazon_Google","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"entity_matching:Beer","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Beer","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"entity_matching:Company","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Company","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"entity_matching:DBLP_ACM","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"DBLP_ACM","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"entity_matching:DBLP_GoogleScholar","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"DBLP_GoogleScholar","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"entity_matching:Dirty_DBLP_ACM","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Dirty_DBLP_ACM","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"entity_matching:Dirty_DBLP_GoogleScholar","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Dirty_DBLP_GoogleScholar","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"entity_matching:Dirty_Walmart_Amazon","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Dirty_Walmart_Amazon","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"entity_matching:Dirty_iTunes_Amazon","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Dirty_iTunes_Amazon","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": 
true,"version":0} +{"name":"entity_matching:Fodors_Zagats","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Fodors_Zagats","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"entity_matching:Walmart_Amazon","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Walmart_Amazon","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"entity_matching:iTunes_Amazon","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"iTunes_Amazon","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"epistemic_reasoning","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"epistemic_reasoning","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"ethics:commonsense","suite":["lighteval","ethics"],"prompt_function":"ethics_commonsense","hf_repo":"lighteval\/hendrycks_ethics","hf_subset":"commonsense","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"ethics:deontology","suite":["lighteval","ethics"],"prompt_function":"ethics_deontology","hf_repo":"lighteval\/hendrycks_ethics","hf_subset":"deontology","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"ethics:justice","suite":["lighteval","ethics"],"prompt_function":"ethics_justice","hf_repo":"lighteval\/hendrycks_ethics","hf_subset":"justice","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"ethics:utilitarianism","suite":["lighteval","ethics"],"prompt_function":"ethics_utilitarianism","hf_repo":"lighteval\/hendrycks_ethics","hf_subset":"utilitarianism","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"ethics:virtue","suite":["lighteval","ethics"],"prompt_function":"ethics_virtue","hf_repo":"lighteval\/hendrycks_ethics","hf_subset":"virtue","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"evaluating_information_essentiality","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"evaluating_information_essentiality","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"fact_checker","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"fact_checker","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"fantasy_reasoning","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"fantasy_reasoning","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"few_shot_nlg","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"few_shot_nlg","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","bleurt"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"figure_of_speech_detection","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"figure_of_speech_detection","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"formal_fallacies_syllogisms_negation","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"formal_fallacies_syllogisms_negation","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"gem","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"gem","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"gender_inclusive_sentences_german","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"gender_inclusive_sentences_german","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"general_knowledge","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"general_knowledge","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"geometric_shapes","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"geometric_shapes","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["rouge_t5","bleu","loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"glue:cola","suite":["lighteval","glue"],"prompt_function":"cola","hf_repo":"glue","hf_subset":"cola","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token", "mcc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"glue:mnli","suite":["lighteval","glue"],"prompt_function":"mnli","hf_repo":"glue","hf_subset":"mnli_matched","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"glue:mnli_mismatched","suite":["lighteval","glue"],"prompt_function":"mnli","hf_repo":"glue","hf_subset":"mnli_mismatched","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"glue:mrpc","suite":["lighteval","glue"],"prompt_function":"mrpc","hf_repo":"glue","hf_subset":"mrpc","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc", "loglikelihood_f1"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"glue:qnli","suite":["lighteval","glue"],"prompt_function":"qnli","hf_repo":"glue","hf_subset":"qnli","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"glue:qqp","suite":["lighteval","glue"],"prompt_function":"qqp","hf_repo":"glue","hf_subset":"qqp","hf_avail_splits":["train","validation","test"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc", "loglikelihood_f1"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"glue:rte","suite":["lighteval","glue"],"prompt_function":"rte","hf_repo":"glue","hf_subset":"rte","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"glue:sst2","suite":["lighteval","glue"],"prompt_function":"sst","hf_repo":"glue","hf_subset":"sst2","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"glue:stsb","suite":["lighteval","glue"],"prompt_function":"stsb","hf_repo":"glue","hf_subset":"stsb","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"glue:wnli","suite":["lighteval","glue"],"prompt_function":"wnli","hf_repo":"glue","hf_subset":"wnli","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"goal_step_wikihow","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"goal_step_wikihow","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"gpqa","suite":["lighteval"],"prompt_function":"gpqa","hf_repo":"Idavidrein/gpqa","hf_subset":"gpqa_main","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"gre_reading_comprehension","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"gre_reading_comprehension","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"gsm8k","suite":["leaderboard"],"prompt_function":"gsm8k","hf_repo":"gsm8k","hf_subset":"main","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":256,"metric":["quasi_exact_match_gsm8k"],"stop_sequence":["Question:","Question",":"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"gsm8k","suite":["lighteval"],"prompt_function":"gsm8k","hf_repo":"gsm8k","hf_subset":"main","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":256,"metric":["quasi_exact_match_gsm8k","maj_at_8_gsm8k"],"stop_sequence":["Question:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"headqa:en","suite":["lighteval","headqa"],"prompt_function":"headqa","hf_repo":"lighteval/headqa_harness","hf_subset":"en","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"headqa:es","suite":["lighteval","headqa"],"prompt_function":"headqa","hf_repo":"lighteval/headqa_harness","hf_subset":"es","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"hellaswag","suite":["leaderboard"],"prompt_function":"hellaswag_harness","hf_repo":"hellaswag","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"hellaswag","suite":["helm","helm_general"],"prompt_function":"hellaswag_helm","hf_repo":"hellaswag","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"hhh_alignment","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"hhh_alignment","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"hindi_question_answering","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"hindi_question_answering","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"hindu_knowledge","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"hindu_knowledge","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"hinglish_toxicity","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"hinglish_toxicity","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"human_organs_senses","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"human_organs_senses","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"humaneval","suite":["helm","code_scenario"],"prompt_function":"humaneval","hf_repo":"openai_humaneval","hf_subset":"openai_humaneval","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":600,"metric":["code_humaneval"],"stop_sequence":["\nclass","\ndef","\nif","\nprint"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"hyperbaton","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"hyperbaton","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"identify_math_theorems","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"identify_math_theorems","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"identify_odd_metaphor","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"identify_odd_metaphor","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"imdb","suite":["helm","helm_general"],"prompt_function":"imdb","hf_repo":"lighteval\/IMDB_helm","hf_subset":"default","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"imdb:contrastset","suite":["helm"],"prompt_function":"imdb_contrastset","hf_repo":"lighteval\/IMDB_helm","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"implicatures","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"implicatures","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"implicit_relations","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"implicit_relations","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"intent_recognition","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"intent_recognition","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"interactive_qa_mmlu:abstract_algebra","suite":["helm","interactive_qa_mmlu_scenario"],"prompt_function":"mmlu_qa_abstract_algebra","hf_repo":"lighteval\/mmlu","hf_subset":"abstract_algebra","hf_avail_splits":["dev","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"interactive_qa_mmlu:college_chemistry","suite":["helm","interactive_qa_mmlu_scenario"],"prompt_function":"mmlu_qa_college_chemistry","hf_repo":"lighteval\/mmlu","hf_subset":"college_chemistry","hf_avail_splits":["dev","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"interactive_qa_mmlu:global_facts","suite":["helm","interactive_qa_mmlu_scenario"],"prompt_function":"mmlu_qa_global_facts","hf_repo":"lighteval\/mmlu","hf_subset":"global_facts","hf_avail_splits":["dev","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"interactive_qa_mmlu:miscellaneous","suite":["helm","interactive_qa_mmlu_scenario"],"prompt_function":"mmlu_qa_miscellaneous","hf_repo":"lighteval\/mmlu","hf_subset":"miscellaneous","hf_avail_splits":["dev","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"interactive_qa_mmlu:nutrition","suite":["helm","interactive_qa_mmlu_scenario"],"prompt_function":"mmlu_qa_nutrition","hf_repo":"lighteval\/mmlu","hf_subset":"nutrition","hf_avail_splits":["dev","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"interactive_qa_mmlu:us_foreign_policy","suite":["helm","interactive_qa_mmlu_scenario"],"prompt_function":"mmlu_qa_us_foreign_policy","hf_repo":"lighteval\/mmlu","hf_subset":"us_foreign_policy","hf_avail_splits":["dev","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"international_phonetic_alphabet_nli","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"international_phonetic_alphabet_nli","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"international_phonetic_alphabet_transliterate","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"international_phonetic_alphabet_transliterate","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"intersect_geometry","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"intersect_geometry","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"irony_identification","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"irony_identification","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"iwslt17:ar-en","suite":["lighteval","harness_selection"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_ar-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"iwslt17:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"iwslt17:en-ar","suite":["lighteval","harness_selection"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_ar-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"iwslt17:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"iwslt17:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"iwslt17:en-ja","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_en-ja","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"iwslt17:en-ko","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_en-ko","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"iwslt17:en-zh","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_en-zh","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"iwslt17:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"iwslt17:ja-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_ja-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"iwslt17:ko-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_ko-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"iwslt17:zh-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_zh-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"kanji_ascii","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"kanji_ascii","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"kannada","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"kannada","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"key_value_maps","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"key_value_maps","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"known_unknowns","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"known_unknowns","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lambada:standard","suite":["lighteval","lambada"],"prompt_function":"lambada","hf_repo":"lambada","hf_subset":"plain_text","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lambada:standard_cloze","suite":["lighteval","lambada"],"prompt_function":"lambada_cloze","hf_repo":"lambada","hf_subset":"plain_text","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lambada:openai","suite":["lighteval","lambada"],"prompt_function":"lambada","hf_repo":"EleutherAI\/lambada_openai","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lambada:openai:de","suite":["lighteval","lambada"],"prompt_function":"lambada","hf_repo":"EleutherAI\/lambada_openai","hf_subset":"de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lambada:openai:en","suite":["lighteval","lambada"],"prompt_function":"lambada","hf_repo":"EleutherAI\/lambada_openai","hf_subset":"en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lambada:openai:es","suite":["lighteval","lambada"],"prompt_function":"lambada","hf_repo":"EleutherAI\/lambada_openai","hf_subset":"es","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lambada:openai:fr","suite":["lighteval","lambada"],"prompt_function":"lambada","hf_repo":"EleutherAI\/lambada_openai","hf_subset":"fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"lambada:openai:it","suite":["lighteval","lambada"],"prompt_function":"lambada","hf_repo":"EleutherAI\/lambada_openai","hf_subset":"it","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lambada:openai_cloze","suite":["lighteval","lambada"],"prompt_function":"lambada_cloze","hf_repo":"EleutherAI\/lambada_openai","hf_subset":"en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"language_games","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"language_games","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"language_identification","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"language_identification","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"legal_summarization:billsum","suite":["helm"],"prompt_function":"legal_summarization","hf_repo":"lighteval\/legal_summarization","hf_subset":"BillSum","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1024,"metric":["rouge1","rouge2","rougeL","faithfulness","extractiveness","bert_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"legal_summarization:eurlexsum","suite":["helm"],"prompt_function":"legal_summarization","hf_repo":"lighteval\/legal_summarization","hf_subset":"EurLexSum","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["rouge1","rouge2","rougeL","faithfulness","extractiveness","bert_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"legal_summarization:multilexsum","suite":["helm"],"prompt_function":"multilexsum","hf_repo":"lighteval\/legal_summarization","hf_subset":"MultiLexSum","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":256,"metric":["rouge1","rouge2","rougeL","faithfulness","extractiveness","bert_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"legalsupport","suite":["helm"],"prompt_function":"legal_support","hf_repo":"lighteval\/LegalSupport","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["loglikelihood_acc","exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lexglue:case_hold","suite":["helm","lex_glue_scenario"],"prompt_function":"lex_glue_case_hold","hf_repo":"lighteval\/lexglue","hf_subset":"case_hold","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lexglue:ecthr_a","suite":["helm","lex_glue_scenario"],"prompt_function":"lex_glue_ecthr_a","hf_repo":"lighteval\/lexglue","hf_subset":"ecthr_a","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lexglue:ecthr_b","suite":["helm","lex_glue_scenario"],"prompt_function":"lex_glue_ecthr_b","hf_repo":"lighteval\/lexglue","hf_subset":"ecthr_b","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lexglue:eurlex","suite":["helm","lex_glue_scenario"],"prompt_function":"lex_glue_eurlex","hf_repo":"lighteval\/lexglue","hf_subset":"eurlex","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lexglue:ledgar","suite":["helm","lex_glue_scenario"],"prompt_function":"lex_glue_ledgar","hf_repo":"lighteval\/lexglue","hf_subset":"ledgar","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lexglue:scotus","suite":["helm","lex_glue_scenario"],"prompt_function":"lex_glue_scotus","hf_repo":"lighteval\/lexglue","hf_subset":"scotus","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"lexglue:unfair_tos","suite":["helm","lex_glue_scenario"],"prompt_function":"lex_glue_unfair_tos","hf_repo":"lighteval\/lexglue","hf_subset":"unfair_tos","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lextreme:brazilian_court_decisions_judgment","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_brazilian_court_decisions_judgment","hf_repo":"lighteval\/lextreme","hf_subset":"brazilian_court_decisions_judgment","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lextreme:brazilian_court_decisions_unanimity","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_brazilian_court_decisions_unanimity","hf_repo":"lighteval\/lextreme","hf_subset":"brazilian_court_decisions_unanimity","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lextreme:covid19_emergency_event","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_covid19_emergency_event","hf_repo":"lighteval\/lextreme","hf_subset":"covid19_emergency_event","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lextreme:german_argument_mining","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_german_argument_mining","hf_repo":"lighteval\/lextreme","hf_subset":"german_argument_mining","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lextreme:greek_legal_code_chapter","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_greek_legal_code_chapter","hf_repo":"lighteval\/lextreme","hf_subset":"greek_legal_code_chapter","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"lextreme:greek_legal_code_subject","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_greek_legal_code_subject","hf_repo":"lighteval\/lextreme","hf_subset":"greek_legal_code_subject","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lextreme:greek_legal_code_volume","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_greek_legal_code_volume","hf_repo":"lighteval\/lextreme","hf_subset":"greek_legal_code_volume","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lextreme:greek_legal_ner","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_greek_legal_ner","hf_repo":"lighteval\/lextreme","hf_subset":"greek_legal_ner","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":430,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lextreme:legalnero","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_legalnero","hf_repo":"lighteval\/lextreme","hf_subset":"legalnero","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":788,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lextreme:lener_br","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_lener_br","hf_repo":"lighteval\/lextreme","hf_subset":"lener_br","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":338,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lextreme:mapa_coarse","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_mapa_coarse","hf_repo":"lighteval\/lextreme","hf_subset":"mapa_coarse","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":274,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"lextreme:mapa_fine","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_mapa_fine","hf_repo":"lighteval\/lextreme","hf_subset":"mapa_fine","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":274,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lextreme:multi_eurlex_level_1","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_multi_eurlex_level_1","hf_repo":"lighteval\/lextreme","hf_subset":"multi_eurlex_level_1","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lextreme:multi_eurlex_level_2","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_multi_eurlex_level_2","hf_repo":"lighteval\/lextreme","hf_subset":"multi_eurlex_level_2","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lextreme:multi_eurlex_level_3","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_multi_eurlex_level_3","hf_repo":"lighteval\/lextreme","hf_subset":"multi_eurlex_level_3","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lextreme:online_terms_of_service_clause_topics","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_online_terms_of_service_clause_topics","hf_repo":"lighteval\/lextreme","hf_subset":"online_terms_of_service_clause_topics","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lextreme:online_terms_of_service_unfairness_levels","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_online_terms_of_service_unfairness_levels","hf_repo":"lighteval\/lextreme","hf_subset":"online_terms_of_service_unfairness_levels","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"lextreme:swiss_judgment_prediction","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_swiss_judgment_prediction","hf_repo":"lighteval\/lextreme","hf_subset":"swiss_judgment_prediction","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"linguistic_mappings","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"linguistic_mappings","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"linguistics_puzzles","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"linguistics_puzzles","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["bleu","rouge_t5","perfect_exact_match"],"stop_sequence":null,"output_regex":"[^\\.\\?\\!\\;\\n]+", "trust_dataset": true,"version":0} +{"name":"logic_grid_puzzle","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"logic_grid_puzzle","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"logical_args","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"logical_args","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"logical_deduction","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"logical_deduction","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"logical_fallacy_detection","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"logical_fallacy_detection","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"logical_sequence","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"logical_sequence","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"logiqa","suite":["lighteval"],"prompt_function":"logiqa","hf_repo":"lighteval/logiqa_harness","hf_subset":"logiqa","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lsat_qa","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"all","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lsat_qa:assignment","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"assignment","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lsat_qa:grouping","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"grouping","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lsat_qa:miscellaneous","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"miscellaneous","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"lsat_qa:ordering","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"ordering","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"math:algebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"algebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"math:counting_and_probability","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"counting_and_probability","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"math:geometry","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"geometry","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"math:intermediate_algebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"intermediate_algebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"math:number_theory","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"number_theory","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"math:prealgebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"prealgebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"math:precalculus","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"precalculus","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mathematical_induction","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"mathematical_induction","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"mathqa","suite":["lighteval"],"prompt_function":"mathqa","hf_repo":"math_qa","hf_subset":"default","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"matrixshapes","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"matrixshapes","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"me_q_sum","suite":["helm"],"prompt_function":"me_q_sum","hf_repo":"lighteval\/me_q_sum","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":128,"metric":["exact_match","quasi_exact_match","f1_score","rougeL","bleu_1","bleu_4"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"med_dialog:healthcaremagic","suite":["helm"],"prompt_function":"med_dialog","hf_repo":"lighteval\/med_dialog","hf_subset":"healthcaremagic","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":128,"metric":["exact_match","quasi_exact_match","f1_score","rougeL","bleu_1","bleu_4"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"med_dialog:icliniq","suite":["helm"],"prompt_function":"med_dialog","hf_repo":"lighteval\/med_dialog","hf_subset":"icliniq","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":128,"metric":["exact_match","quasi_exact_match","f1_score","rougeL","bleu_1","bleu_4"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"med_mcqa","suite":["helm"],"prompt_function":"med_mcqa","hf_repo":"lighteval\/med_mcqa","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["loglikelihood_acc","exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"med_paragraph_simplification","suite":["helm"],"prompt_function":"med_paragraph_simplification","hf_repo":"lighteval\/med_paragraph_simplification","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":512,"metric":["exact_match","quasi_exact_match","f1_score","rougeL","bleu_1","bleu_4"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"med_qa","suite":["helm"],"prompt_function":"med_qa","hf_repo":"bigbio\/med_qa","hf_subset":"med_qa_en_source","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["loglikelihood_acc","exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"metaphor_boolean","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"metaphor_boolean","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"metaphor_understanding","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"metaphor_understanding","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mgsm:en","suite":["lighteval"],"prompt_function":"mgsm_en","hf_repo":"juletxara/mgsm","hf_subset":"en","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "Question:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mgsm:es","suite":["lighteval"],"prompt_function":"mgsm_es","hf_repo":"juletxara/mgsm","hf_subset":"es","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "Pregunta:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mgsm:fr","suite":["lighteval"],"prompt_function":"mgsm_fr","hf_repo":"juletxara/mgsm","hf_subset":"fr","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "Question:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mgsm:de","suite":["lighteval"],"prompt_function":"mgsm_de","hf_repo":"juletxara/mgsm","hf_subset":"de","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "Frage:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mgsm:ru","suite":["lighteval"],"prompt_function":"mgsm_ru","hf_repo":"juletxara/mgsm","hf_subset":"ru","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "\u0417\u0430\u0434\u0430\u0447\u0430:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"mgsm:zh","suite":["lighteval"],"prompt_function":"mgsm_zh","hf_repo":"juletxara/mgsm","hf_subset":"zh","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "\u95ee\u9898:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mgsm:ja","suite":["lighteval"],"prompt_function":"mgsm_ja","hf_repo":"juletxara/mgsm","hf_subset":"ja","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "\u554f\u984c:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mgsm:th","suite":["lighteval"],"prompt_function":"mgsm_th","hf_repo":"juletxara/mgsm","hf_subset":"th","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "\u0e42\u0e08\u0e17\u0e22\u0e4c:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mgsm:sw","suite":["lighteval"],"prompt_function":"mgsm_sw","hf_repo":"juletxara/mgsm","hf_subset":"sw","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "Swali:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mgsm:bn","suite":["lighteval"],"prompt_function":"mgsm_bn","hf_repo":"juletxara/mgsm","hf_subset":"bn","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "\u09aa\u09cd\u09b0\u09b6\u09cd\u09a8:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mgsm:te","suite":["lighteval"],"prompt_function":"mgsm_te","hf_repo":"juletxara/mgsm","hf_subset":"te","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "\u0c2a\u0c4d\u0c30\u0c36\u0c4d\u0c28:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"minute_mysteries_qa","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"minute_mysteries_qa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","rouge_t5"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"misconceptions","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"misconceptions","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"misconceptions_russian","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"misconceptions_russian","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"all","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu","suite":["original"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"all","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":5,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:abstract_algebra","suite":["original","mmlu"],"prompt_function":"mmlu_abstract_algebra","hf_repo":"cais\/mmlu","hf_subset":"abstract_algebra","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:abstract_algebra","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"abstract_algebra","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:abstract_algebra","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"abstract_algebra","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:anatomy","suite":["original","mmlu"],"prompt_function":"mmlu_anatomy","hf_repo":"cais\/mmlu","hf_subset":"anatomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"mmlu:anatomy","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"anatomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:anatomy","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"anatomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:astronomy","suite":["original","mmlu"],"prompt_function":"mmlu_astronomy","hf_repo":"cais\/mmlu","hf_subset":"astronomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:astronomy","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"astronomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:astronomy","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"astronomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:business_ethics","suite":["original","mmlu"],"prompt_function":"mmlu_business_ethics","hf_repo":"cais\/mmlu","hf_subset":"business_ethics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:business_ethics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"business_ethics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"mmlu:business_ethics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"business_ethics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:clinical_knowledge","suite":["original","mmlu"],"prompt_function":"mmlu_clinical_knowledge","hf_repo":"cais\/mmlu","hf_subset":"clinical_knowledge","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:clinical_knowledge","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"clinical_knowledge","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:clinical_knowledge","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"clinical_knowledge","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:college_biology","suite":["original","mmlu"],"prompt_function":"mmlu_college_biology","hf_repo":"cais\/mmlu","hf_subset":"college_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:college_biology","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:college_biology","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"college_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"mmlu:college_chemistry","suite":["original","mmlu"],"prompt_function":"mmlu_college_chemistry","hf_repo":"cais\/mmlu","hf_subset":"college_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:college_chemistry","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:college_chemistry","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"college_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:college_computer_science","suite":["original","mmlu"],"prompt_function":"mmlu_college_computer_science","hf_repo":"cais\/mmlu","hf_subset":"college_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:college_computer_science","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:college_computer_science","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"college_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:college_mathematics","suite":["original","mmlu"],"prompt_function":"mmlu_college_mathematics","hf_repo":"cais\/mmlu","hf_subset":"college_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"mmlu:college_mathematics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:college_mathematics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"college_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:college_medicine","suite":["original","mmlu"],"prompt_function":"mmlu_college_medicine","hf_repo":"cais\/mmlu","hf_subset":"college_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:college_medicine","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:college_medicine","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"college_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:college_physics","suite":["original","mmlu"],"prompt_function":"mmlu_college_physics","hf_repo":"cais\/mmlu","hf_subset":"college_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:college_physics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"mmlu:college_physics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"college_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:computer_security","suite":["original","mmlu"],"prompt_function":"mmlu_computer_security","hf_repo":"cais\/mmlu","hf_subset":"computer_security","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:computer_security","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"computer_security","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:computer_security","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"computer_security","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:conceptual_physics","suite":["original","mmlu"],"prompt_function":"mmlu_conceptual_physics","hf_repo":"cais\/mmlu","hf_subset":"conceptual_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:conceptual_physics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"conceptual_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:conceptual_physics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"conceptual_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"mmlu:econometrics","suite":["original","mmlu"],"prompt_function":"mmlu_econometrics","hf_repo":"cais\/mmlu","hf_subset":"econometrics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:econometrics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"econometrics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:econometrics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"econometrics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:electrical_engineering","suite":["original","mmlu"],"prompt_function":"mmlu_electrical_engineering","hf_repo":"cais\/mmlu","hf_subset":"electrical_engineering","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:electrical_engineering","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"electrical_engineering","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:electrical_engineering","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"electrical_engineering","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:elementary_mathematics","suite":["original","mmlu"],"prompt_function":"mmlu_elementary_mathematics","hf_repo":"cais\/mmlu","hf_subset":"elementary_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"mmlu:elementary_mathematics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"elementary_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:elementary_mathematics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"elementary_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:formal_logic","suite":["original","mmlu"],"prompt_function":"mmlu_formal_logic","hf_repo":"cais\/mmlu","hf_subset":"formal_logic","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:formal_logic","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"formal_logic","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:formal_logic","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"formal_logic","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:global_facts","suite":["original","mmlu"],"prompt_function":"mmlu_global_facts","hf_repo":"cais\/mmlu","hf_subset":"global_facts","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:global_facts","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"global_facts","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"mmlu:global_facts","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"global_facts","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_biology","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_biology","hf_repo":"cais\/mmlu","hf_subset":"high_school_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_biology","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_biology","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_chemistry","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_chemistry","hf_repo":"cais\/mmlu","hf_subset":"high_school_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_chemistry","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_chemistry","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"mmlu:high_school_computer_science","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_computer_science","hf_repo":"cais\/mmlu","hf_subset":"high_school_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_computer_science","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_computer_science","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_european_history","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_european_history","hf_repo":"cais\/mmlu","hf_subset":"high_school_european_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_european_history","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_european_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_european_history","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_european_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_geography","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_geography","hf_repo":"cais\/mmlu","hf_subset":"high_school_geography","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"mmlu:high_school_geography","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_geography","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_geography","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_geography","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_government_and_politics","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_government_and_politics","hf_repo":"cais\/mmlu","hf_subset":"high_school_government_and_politics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_government_and_politics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_government_and_politics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_government_and_politics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_government_and_politics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_macroeconomics","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_macroeconomics","hf_repo":"cais\/mmlu","hf_subset":"high_school_macroeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_macroeconomics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_macroeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"mmlu:high_school_macroeconomics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_macroeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_mathematics","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_mathematics","hf_repo":"cais\/mmlu","hf_subset":"high_school_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_mathematics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_mathematics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_microeconomics","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_microeconomics","hf_repo":"cais\/mmlu","hf_subset":"high_school_microeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_microeconomics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_microeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_microeconomics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_microeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"mmlu:high_school_physics","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_physics","hf_repo":"cais\/mmlu","hf_subset":"high_school_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_physics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_physics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_psychology","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_psychology","hf_repo":"cais\/mmlu","hf_subset":"high_school_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_psychology","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_psychology","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_statistics","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_statistics","hf_repo":"cais\/mmlu","hf_subset":"high_school_statistics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"mmlu:high_school_statistics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_statistics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_statistics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_statistics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_us_history","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_us_history","hf_repo":"cais\/mmlu","hf_subset":"high_school_us_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_us_history","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_us_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_us_history","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_us_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_world_history","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_world_history","hf_repo":"cais\/mmlu","hf_subset":"high_school_world_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:high_school_world_history","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_world_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"mmlu:high_school_world_history","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_world_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:human_aging","suite":["original","mmlu"],"prompt_function":"mmlu_human_aging","hf_repo":"cais\/mmlu","hf_subset":"human_aging","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:human_aging","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"human_aging","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:human_aging","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"human_aging","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:human_sexuality","suite":["original","mmlu"],"prompt_function":"mmlu_human_sexuality","hf_repo":"cais\/mmlu","hf_subset":"human_sexuality","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:human_sexuality","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"human_sexuality","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:human_sexuality","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"human_sexuality","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"mmlu:international_law","suite":["original","mmlu"],"prompt_function":"mmlu_international_law","hf_repo":"cais\/mmlu","hf_subset":"international_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:international_law","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"international_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:international_law","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"international_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:jurisprudence","suite":["original","mmlu"],"prompt_function":"mmlu_jurisprudence","hf_repo":"cais\/mmlu","hf_subset":"jurisprudence","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:jurisprudence","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"jurisprudence","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:jurisprudence","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"jurisprudence","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:logical_fallacies","suite":["original","mmlu"],"prompt_function":"mmlu_logical_fallacies","hf_repo":"cais\/mmlu","hf_subset":"logical_fallacies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"mmlu:logical_fallacies","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"logical_fallacies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:logical_fallacies","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"logical_fallacies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:machine_learning","suite":["original","mmlu"],"prompt_function":"mmlu_machine_learning","hf_repo":"cais\/mmlu","hf_subset":"machine_learning","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:machine_learning","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"machine_learning","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:machine_learning","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"machine_learning","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:management","suite":["original","mmlu"],"prompt_function":"mmlu_management","hf_repo":"cais\/mmlu","hf_subset":"management","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:management","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"management","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"mmlu:management","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"management","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:marketing","suite":["original","mmlu"],"prompt_function":"mmlu_marketing","hf_repo":"cais\/mmlu","hf_subset":"marketing","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:marketing","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"marketing","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:marketing","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"marketing","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:medical_genetics","suite":["original","mmlu"],"prompt_function":"mmlu_medical_genetics","hf_repo":"cais\/mmlu","hf_subset":"medical_genetics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:medical_genetics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"medical_genetics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:medical_genetics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"medical_genetics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"mmlu:miscellaneous","suite":["original","mmlu"],"prompt_function":"mmlu_miscellaneous","hf_repo":"cais\/mmlu","hf_subset":"miscellaneous","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:miscellaneous","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"miscellaneous","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:miscellaneous","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"miscellaneous","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:moral_disputes","suite":["original","mmlu"],"prompt_function":"mmlu_moral_disputes","hf_repo":"cais\/mmlu","hf_subset":"moral_disputes","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:moral_disputes","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"moral_disputes","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:moral_disputes","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"moral_disputes","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:moral_scenarios","suite":["original","mmlu"],"prompt_function":"mmlu_moral_scenarios","hf_repo":"cais\/mmlu","hf_subset":"moral_scenarios","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"mmlu:moral_scenarios","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"moral_scenarios","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:moral_scenarios","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"moral_scenarios","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:nutrition","suite":["original","mmlu"],"prompt_function":"mmlu_nutrition","hf_repo":"cais\/mmlu","hf_subset":"nutrition","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:nutrition","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"nutrition","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:nutrition","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"nutrition","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:philosophy","suite":["original","mmlu"],"prompt_function":"mmlu_philosophy","hf_repo":"cais\/mmlu","hf_subset":"philosophy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:philosophy","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"philosophy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"mmlu:philosophy","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"philosophy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:prehistory","suite":["original","mmlu"],"prompt_function":"mmlu_prehistory","hf_repo":"cais\/mmlu","hf_subset":"prehistory","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:prehistory","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"prehistory","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:prehistory","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"prehistory","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:professional_accounting","suite":["original","mmlu"],"prompt_function":"mmlu_professional_accounting","hf_repo":"cais\/mmlu","hf_subset":"professional_accounting","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:professional_accounting","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"professional_accounting","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:professional_accounting","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"professional_accounting","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"mmlu:professional_law","suite":["original","mmlu"],"prompt_function":"mmlu_professional_law","hf_repo":"cais\/mmlu","hf_subset":"professional_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:professional_law","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"professional_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:professional_law","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"professional_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:professional_medicine","suite":["original","mmlu"],"prompt_function":"mmlu_professional_medicine","hf_repo":"cais\/mmlu","hf_subset":"professional_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:professional_medicine","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"professional_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:professional_medicine","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"professional_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:professional_psychology","suite":["original","mmlu"],"prompt_function":"mmlu_professional_psychology","hf_repo":"cais\/mmlu","hf_subset":"professional_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"mmlu:professional_psychology","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"professional_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:professional_psychology","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"professional_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:public_relations","suite":["original","mmlu"],"prompt_function":"mmlu_public_relations","hf_repo":"cais\/mmlu","hf_subset":"public_relations","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:public_relations","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"public_relations","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:public_relations","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"public_relations","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:security_studies","suite":["original","mmlu"],"prompt_function":"mmlu_security_studies","hf_repo":"cais\/mmlu","hf_subset":"security_studies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:security_studies","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"security_studies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"mmlu:security_studies","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"security_studies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:sociology","suite":["original","mmlu"],"prompt_function":"mmlu_sociology","hf_repo":"cais\/mmlu","hf_subset":"sociology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:sociology","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"sociology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:sociology","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"sociology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:us_foreign_policy","suite":["original","mmlu"],"prompt_function":"mmlu_us_foreign_policy","hf_repo":"cais\/mmlu","hf_subset":"us_foreign_policy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:us_foreign_policy","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"us_foreign_policy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:us_foreign_policy","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"us_foreign_policy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"mmlu:virology","suite":["original","mmlu"],"prompt_function":"mmlu_virology","hf_repo":"cais\/mmlu","hf_subset":"virology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:virology","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"virology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:virology","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"virology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:world_religions","suite":["original","mmlu"],"prompt_function":"mmlu_world_religions","hf_repo":"cais\/mmlu","hf_subset":"world_religions","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:world_religions","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"world_religions","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mmlu:world_religions","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"world_religions","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mnist_ascii","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"mnist_ascii","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"modified_arithmetic","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"modified_arithmetic","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"moral_permissibility","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"moral_permissibility","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"movie_dialog_same_or_different","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"movie_dialog_same_or_different","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"movie_recommendation","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"movie_recommendation","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mtnt2019:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"mtnt2019_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":200,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mtnt2019:en-ja","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"mtnt2019_en-ja","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":200,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mtnt2019:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"mtnt2019_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":200,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mtnt2019:ja-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"mtnt2019_ja-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":200,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"mult_data_wrangling","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"mult_data_wrangling","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"multiemo","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"multiemo","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mutual","suite":["lighteval"],"prompt_function":"mutual","hf_repo":"lighteval\/mutual_harness","hf_subset":"mutual","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["recall_at_1","recall_at_2","mrr"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"mutual_plus","suite":["lighteval"],"prompt_function":"mutual","hf_repo":"lighteval\/mutual_harness","hf_subset":"mutual_plus","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["recall_at_1","recall_at_2","mrr"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"narrativeqa","suite":["helm","helm_general"],"prompt_function":"narrativeqa","hf_repo":"lighteval/narrative_qa_helm","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","f1_score","rougeL","bleu_1","bleu_4"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"natural_instructions","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"natural_instructions","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"navigate","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"navigate","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"nonsense_words_grammar","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"nonsense_words_grammar","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"novel_concepts","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"novel_concepts","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"numeracy:linear_example","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"linear_example","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"numeracy:linear_standard","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"linear_standard","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"numeracy:parabola_example","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"parabola_example","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"numeracy:parabola_standard","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"parabola_standard","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"numeracy:paraboloid_example","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"paraboloid_example","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"numeracy:paraboloid_standard","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"paraboloid_standard","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"numeracy:plane_example","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"plane_example","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"numeracy:plane_standard","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"plane_standard","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"object_counting","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"object_counting","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"odd_one_out","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"odd_one_out","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"openbookqa","suite":["helm","commonsense_scenario","helm_general"],"prompt_function":"openbookqa_helm","hf_repo":"openbookqa","hf_subset":"main","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"openbookqa","suite":["lighteval"],"prompt_function":"openbookqa","hf_repo":"openbookqa","hf_subset":"main","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"operators","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"operators","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":"([-+]?\\d+)[.]{0,1}$", "trust_dataset": true,"version":0} +{"name":"paragraph_segmentation","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"paragraph_segmentation","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"parsinlu_qa","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"parsinlu_qa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"parsinlu_reading_comprehension","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"parsinlu_reading_comprehension","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["perfect_exact_match"],"stop_sequence":null,"output_regex":"[^\\.\\?\\!\\;\\n]+", "trust_dataset": true,"version":0} +{"name":"penguins_in_a_table","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"penguins_in_a_table","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"periodic_elements","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"periodic_elements","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"persian_idioms","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"persian_idioms","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"phrase_relatedness","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"phrase_relatedness","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"physical_intuition","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"physical_intuition","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"physics","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"physics","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"physics_questions","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"physics_questions","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"piqa","suite":["lighteval"],"prompt_function":"piqa_harness","hf_repo":"piqa","hf_subset":"plain_text","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"piqa","suite":["helm","commonsense_scenario"],"prompt_function":"piqa_helm","hf_repo":"piqa","hf_subset":"plain_text","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"play_dialog_same_or_different","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"play_dialog_same_or_different","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"polish_sequence_labeling","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"polish_sequence_labeling","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["f1_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"presuppositions_as_nli","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"presuppositions_as_nli","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"prost","suite":["lighteval"],"prompt_function":"prost","hf_repo":"corypaik\/prost","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"pubmedqa","suite":["lighteval"],"prompt_function":"pubmed_qa","hf_repo":"pubmed_qa","hf_subset":"pqa_labeled","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"pubmedqa","suite":["helm"],"prompt_function":"pubmed_qa_helm","hf_repo":"pubmed_qa","hf_subset":"pqa_labeled","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"qa4mre:2011","suite":["lighteval"],"prompt_function":"qa4mre","hf_repo":"qa4mre","hf_subset":"2011.main.EN","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"qa4mre:2012","suite":["lighteval"],"prompt_function":"qa4mre","hf_repo":"qa4mre","hf_subset":"2012.main.EN","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"qa4mre:2013","suite":["lighteval"],"prompt_function":"qa4mre","hf_repo":"qa4mre","hf_subset":"2013.main.EN","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"qa_wikidata","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"qa_wikidata","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleurt","bleu","rouge_t5","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"qasper","suite":["lighteval"],"prompt_function":"qasper","hf_repo":"qasper","hf_subset":"qasper","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["f1_score_quasi"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"qasper_ll","suite":["lighteval"],"prompt_function":"qasper_ll","hf_repo":"qasper","hf_subset":"qasper","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"quac","suite":["helm"],"prompt_function":"quac","hf_repo":"lighteval/quac_helm","hf_subset":"default","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match", "quasi_exact_match", "f1_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"question_selection","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"question_selection","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"race:high","suite":["lighteval","race"],"prompt_function":"race","hf_repo":"EleutherAI/race","hf_subset":"high","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"raft:ade_corpus_v2","suite":["helm","helm_general"],"prompt_function":"raft_ade_corpus_v2","hf_repo":"ought\/raft","hf_subset":"ade_corpus_v2","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"raft:banking_77","suite":["helm","helm_general"],"prompt_function":"raft_banking_77","hf_repo":"ought\/raft","hf_subset":"banking_77","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"raft:neurips_impact_statement_risks","suite":["helm","helm_general"],"prompt_function":"raft_neurips_impact_statement_risks","hf_repo":"ought\/raft","hf_subset":"neurips_impact_statement_risks","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"raft:one_stop_english","suite":["helm","helm_general"],"prompt_function":"raft_one_stop_english","hf_repo":"ought\/raft","hf_subset":"one_stop_english","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"raft:overruling","suite":["helm","helm_general"],"prompt_function":"raft_overruling","hf_repo":"ought\/raft","hf_subset":"overruling","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"raft:semiconductor_org_types","suite":["helm","helm_general"],"prompt_function":"raft_semiconductor_org_types","hf_repo":"ought\/raft","hf_subset":"semiconductor_org_types","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"raft:systematic_review_inclusion","suite":["helm","helm_general"],"prompt_function":"raft_systematic_review_inclusion","hf_repo":"ought\/raft","hf_subset":"systematic_review_inclusion","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"raft:tai_safety_research","suite":["helm","helm_general"],"prompt_function":"raft_tai_safety_research","hf_repo":"ought\/raft","hf_subset":"tai_safety_research","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"raft:terms_of_service","suite":["helm","helm_general"],"prompt_function":"raft_terms_of_service","hf_repo":"ought\/raft","hf_subset":"terms_of_service","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"raft:tweet_eval_hate","suite":["helm","helm_general"],"prompt_function":"raft_tweet_eval_hate","hf_repo":"ought\/raft","hf_subset":"tweet_eval_hate","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"raft:twitter_complaints","suite":["helm","helm_general"],"prompt_function":"raft_twitter_complaints","hf_repo":"ought\/raft","hf_subset":"twitter_complaints","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"real_or_fake_text","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"real_or_fake_text","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"real_toxicity_prompts","suite":["helm"],"prompt_function":"real_toxicity_prompts","hf_repo":"allenai\/real-toxicity-prompts","hf_subset":"default","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["prediction_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"reasoning_about_colored_objects","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"reasoning_about_colored_objects","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"repeat_copy_logic","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"repeat_copy_logic","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"rephrase","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"rephrase","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["rouge_t5","bleu","loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"rhyming","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"rhyming","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"riddle_sense","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"riddle_sense","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"ruin_names","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"ruin_names","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"salient_translation_error_detection","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"salient_translation_error_detection","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"scientific_press_release","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"scientific_press_release","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"sciq","suite":["lighteval"],"prompt_function":"sciq","hf_repo":"sciq","hf_subset":"default","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"semantic_parsing_in_context_sparc","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"semantic_parsing_in_context_sparc","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"semantic_parsing_spider","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"semantic_parsing_spider","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"sentence_ambiguity","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"sentence_ambiguity","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"similarities_abstraction","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"similarities_abstraction","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"simp_turing_concept","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"simp_turing_concept","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"simple_arithmetic_json","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"simple_arithmetic_json","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"simple_arithmetic_json_multiple_choice","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"simple_arithmetic_json_multiple_choice","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"simple_arithmetic_json_subtasks","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"simple_arithmetic_json_subtasks","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"simple_arithmetic_multiple_targets_json","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"simple_arithmetic_multiple_targets_json","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"simple_ethical_questions","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"simple_ethical_questions","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"simple_text_editing","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"simple_text_editing","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"siqa","suite":["helm","commonsense_scenario"],"prompt_function":"siqa","hf_repo":"social_i_qa","hf_subset":"default","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"snarks","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"snarks","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"social_iqa","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"social_iqa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"social_support","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"social_support","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["f1_score_macro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"sports_understanding","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"sports_understanding","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"storycloze:2016","suite":["lighteval","storycloze"],"prompt_function":"storycloze","hf_repo":"story_cloze","hf_subset":"2016","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"storycloze:2018","suite":["lighteval","storycloze"],"prompt_function":"storycloze","hf_repo":"story_cloze","hf_subset":"2018","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"strange_stories","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"strange_stories","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"strategyqa","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"strategyqa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"sufficient_information","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"sufficient_information","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"suicide_risk","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"suicide_risk","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"summarization:cnn-dm","suite":["helm","helm_general"],"prompt_function":"cnn_dm","hf_repo":"lighteval\/summarization","hf_subset":"cnn-dm","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":128,"metric":["rouge1","rouge2","rougeL","faithfulness","extractiveness","bert_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"summarization:xsum","suite":["helm","helm_general"],"prompt_function":"xsum","hf_repo":"lighteval\/summarization","hf_subset":"xsum","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":64,"metric":["rouge1","rouge2","rougeL","faithfulness","extractiveness","bert_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"summarization:xsum-sampled","suite":["helm"],"prompt_function":"xsum","hf_repo":"lighteval\/summarization","hf_subset":"xsum-sampled","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":64,"metric":["rouge1","rouge2","rougeL","faithfulness","extractiveness","bert_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"super_glue:boolq","suite":["lighteval","superglue"],"prompt_function":"boolq_harness","hf_repo":"super_glue","hf_subset":"boolq","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"super_glue:cb","suite":["lighteval","superglue"],"prompt_function":"cb","hf_repo":"super_glue","hf_subset":"cb","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token", "multi_f1_numeric"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"super_glue:copa","suite":["lighteval","superglue"],"prompt_function":"copa","hf_repo":"super_glue","hf_subset":"copa","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"super_glue:rte","suite":["lighteval","superglue"],"prompt_function":"rte","hf_repo":"super_glue","hf_subset":"rte","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"super_glue:multirc","suite":["lighteval","superglue"],"prompt_function":"multirc","hf_repo":"super_glue","hf_subset":"multirc","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"super_glue:wic","suite":["lighteval","superglue"],"prompt_function":"wic","hf_repo":"super_glue","hf_subset":"wic","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"super_glue:wsc","suite":["lighteval","superglue"],"prompt_function":"wsc","hf_repo":"super_glue","hf_subset":"wsc","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"swahili_english_proverbs","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"swahili_english_proverbs","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"swag","suite":["lighteval"],"prompt_function":"swag","hf_repo":"swag","hf_subset":"regular","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"swedish_to_german_proverbs","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"swedish_to_german_proverbs","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"symbol_interpretation","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"symbol_interpretation","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"synthetic_reasoning:induction","suite":["helm"],"prompt_function":"synthetic_reasoning","hf_repo":"lighteval\/synthetic_reasoning","hf_subset":"induction","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":50,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"synthetic_reasoning:natural_easy","suite":["helm"],"prompt_function":"synthetic_reasoning_natural","hf_repo":"lighteval\/synthetic_reasoning_natural","hf_subset":"easy","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","f1_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"synthetic_reasoning:natural_hard","suite":["helm"],"prompt_function":"synthetic_reasoning_natural","hf_repo":"lighteval\/synthetic_reasoning_natural","hf_subset":"hard","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","f1_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"synthetic_reasoning:pattern_match","suite":["helm"],"prompt_function":"synthetic_reasoning","hf_repo":"lighteval\/synthetic_reasoning","hf_subset":"pattern_match","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":50,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"synthetic_reasoning:variable_substitution","suite":["helm"],"prompt_function":"synthetic_reasoning","hf_repo":"lighteval\/synthetic_reasoning","hf_subset":"variable_substitution","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":50,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"tellmewhy","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"tellmewhy","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"temporal_sequences","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"temporal_sequences","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"tense","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"tense","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:arxiv","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_arxiv","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:arxiv","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"arxiv","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, 
"trust_dataset": true,"version":0} +{"name":"the_pile:bibliotik","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"bibliotik","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:bookcorpus2","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_bookcorpus2","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:books3","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_books3","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:commoncrawl","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"commoncrawl","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:dm-mathematics","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_dm-mathematics","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:dm-mathematics","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"dm-mathematics","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:enron","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_enron","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:enron","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"enron","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"the_pile:europarl","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_europarl","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:europarl","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"europarl","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:freelaw","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_freelaw","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:freelaw","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"freelaw","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:github","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_github","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:github","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"github","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:gutenberg","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_gutenberg","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:gutenberg","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"gutenberg","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"the_pile:hackernews","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_hackernews","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:hackernews","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"hackernews","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:nih-exporter","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_nih-exporter","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:nih-exporter","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"nih-exporter","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:opensubtitles","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_opensubtitles","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:opensubtitles","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"opensubtitles","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:openwebtext2","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_openwebtext2","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:openwebtext2","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"openwebtext2","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"the_pile:philpapers","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_philpapers","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:pile-cc","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_pile-cc","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:pubmed-abstracts","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_pubmed-abstracts","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:pubmed-abstracts","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"pubmed-abstracts","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:pubmed-central","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_pubmed-central","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:pubmed-central","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"pubmed-central","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:stackexchange","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_stackexchange","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:stackexchange","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"stackexchange","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"the_pile:ubuntu-irc","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_ubuntu-irc","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:uspto","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_upsto","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:upsto","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"uspto","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:wikipedia","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_wikipedia","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:wikipedia","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"wikipedia","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:youtubesubtitles","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_youtubesubtitles","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"the_pile:youtubesubtitles","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"youtubesubtitles","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"timedial","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"timedial","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"toxigen","suite":["lighteval"],"prompt_function":"toxigen","hf_repo":"skg/toxigen-data","hf_subset":"annotated","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"topical_chat","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"topical_chat","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","loglikelihood_acc","bleurt"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"tracking_shuffled_objects","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"tracking_shuffled_objects","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"triviaqa","suite":["lighteval"],"prompt_function":"triviaqa","hf_repo":"trivia_qa","hf_subset":"rc.nocontext","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["quasi_exact_match_triviaqa"],"stop_sequence":["\n", ".", ","],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"truthfulqa:gen","suite":["lighteval"],"prompt_function":"truthful_qa_generative","hf_repo":"truthful_qa","hf_subset":"generation","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":200,"metric":["bleu","rouge_t5"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"truthfulqa:mc","suite":["leaderboard"],"prompt_function":"truthful_qa_multiple_choice","hf_repo":"truthful_qa","hf_subset":"multiple_choice","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["truthfulqa_mc_metrics"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"truthfulqa","suite":["helm","helm_general"],"prompt_function":"truthful_qa_helm","hf_repo":"lighteval\/truthfulqa_helm","hf_subset":"default","hf_avail_splits":["train","valid"],"evaluation_splits":["valid"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["loglikelihood_acc","exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"twitterAAE:aa","suite":["helm"],"prompt_function":"twitter_aae","hf_repo":"lighteval\/twitterAAE","hf_subset":"aa","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"twitterAAE:white","suite":["helm"],"prompt_function":"twitter_aae","hf_repo":"lighteval\/twitterAAE","hf_subset":"white","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"understanding_fables","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"understanding_fables","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"undo_permutation","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"undo_permutation","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"unit_conversion","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"unit_conversion","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"unit_interpretation","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"unit_interpretation","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"unnatural_in_context_learning","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"unnatural_in_context_learning","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"unscramble:anagrams1","suite":["lighteval","unscramble"],"prompt_function":"unscramble","hf_repo":"lighteval\/GPT3_unscramble","hf_subset":"default","hf_avail_splits":["mid_word_1_anagrams"],"evaluation_splits":["mid_word_1_anagrams"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"unscramble:anagrams2","suite":["lighteval","unscramble"],"prompt_function":"unscramble","hf_repo":"lighteval\/GPT3_unscramble","hf_subset":"default","hf_avail_splits":["mid_word_2_anagrams"],"evaluation_splits":["mid_word_2_anagrams"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"unscramble:cycle_letters","suite":["lighteval","unscramble"],"prompt_function":"unscramble","hf_repo":"lighteval\/GPT3_unscramble","hf_subset":"default","hf_avail_splits":["cycle_letters_in_word"],"evaluation_splits":["cycle_letters_in_word"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"unscramble:random_insertion","suite":["lighteval","unscramble"],"prompt_function":"unscramble","hf_repo":"lighteval\/GPT3_unscramble","hf_subset":"default","hf_avail_splits":["random_insertion_in_word"],"evaluation_splits":["random_insertion_in_word"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"unscramble:reversed_words","suite":["lighteval","unscramble"],"prompt_function":"unscramble","hf_repo":"lighteval\/GPT3_unscramble","hf_subset":"default","hf_avail_splits":["reversed_words"],"evaluation_splits":["reversed_words"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"vitaminc_fact_verification","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"vitaminc_fact_verification","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"webqs","suite":["lighteval"],"prompt_function":"webqs","hf_repo":"web_questions","hf_subset":"default","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"what_is_the_tao","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"what_is_the_tao","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"which_wiki_edit","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"which_wiki_edit","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:applies_to_jurisdiction","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"applies_to_jurisdiction","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wikifact:atomic_number","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"atomic_number","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:author","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"author","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:award_received","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"award_received","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:basic_form_of_government","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"basic_form_of_government","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:capital","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"capital","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:capital_of","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"capital_of","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:central_bank","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"central_bank","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wikifact:composer","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"composer","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:continent","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"continent","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:country","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"country","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:country_of_citizenship","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"country_of_citizenship","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:country_of_origin","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"country_of_origin","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:creator","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"creator","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:currency","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"currency","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wikifact:defendant","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"defendant","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:developer","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"developer","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:diplomatic_relation","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"diplomatic_relation","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:director","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"director","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:discoverer_or_inventor","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"discoverer_or_inventor","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:drug_or_therapy_used_for_treatment","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"drug_or_therapy_used_for_treatment","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:educated_at","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"educated_at","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wikifact:electron_configuration","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"electron_configuration","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:employer","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"employer","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:field_of_work","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"field_of_work","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:file_extension","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"file_extension","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:genetic_association","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"genetic_association","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:genre","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"genre","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:has_part","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"has_part","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wikifact:head_of_government","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"head_of_government","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:head_of_state","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"head_of_state","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:headquarters_location","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"headquarters_location","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:industry","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"industry","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:influenced_by","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"influenced_by","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:instance_of","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"instance_of","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:instrument","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"instrument","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wikifact:language_of_work_or_name","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"language_of_work_or_name","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:languages_spoken_written_or_signed","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"languages_spoken_written_or_signed","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:laws_applied","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"laws_applied","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:located_in_the_administrative_territorial_entity","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"located_in_the_administrative_territorial_entity","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:location","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"location","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:location_of_discovery","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"location_of_discovery","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:location_of_formation","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"location_of_formation","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wikifact:majority_opinion_by","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"majority_opinion_by","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:manufacturer","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"manufacturer","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:measured_physical_quantity","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"measured_physical_quantity","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:medical_condition_treated","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"medical_condition_treated","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:member_of","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"member_of","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:member_of_political_party","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"member_of_political_party","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:member_of_sports_team","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"member_of_sports_team","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wikifact:movement","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"movement","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:named_after","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"named_after","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:native_language","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"native_language","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:number_of_processor_cores","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"number_of_processor_cores","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:occupation","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"occupation","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:office_held_by_head_of_government","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"office_held_by_head_of_government","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:office_held_by_head_of_state","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"office_held_by_head_of_state","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wikifact:official_language","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"official_language","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:operating_system","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"operating_system","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:original_language_of_film_or_TV_show","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"original_language_of_film_or_TV_show","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:original_network","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"original_network","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:overrules","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"overrules","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:owned_by","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"owned_by","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:part_of","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"part_of","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wikifact:participating_team","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"participating_team","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:place_of_birth","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"place_of_birth","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:place_of_death","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"place_of_death","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:plaintiff","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"plaintiff","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:position_held","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"position_held","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:position_played_on_team","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"position_played_on_team","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:programming_language","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"programming_language","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wikifact:recommended_unit_of_measurement","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"recommended_unit_of_measurement","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:record_label","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"record_label","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:religion","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"religion","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:repealed_by","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"repealed_by","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:shares_border_with","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"shares_border_with","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:solved_by","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"solved_by","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:statement_describes","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"statement_describes","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wikifact:stock_exchange","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"stock_exchange","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:subclass_of","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"subclass_of","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:subsidiary","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"subsidiary","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:symptoms_and_signs","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"symptoms_and_signs","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:therapeutic_area","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"therapeutic_area","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:time_of_discovery_or_invention","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"time_of_discovery_or_invention","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikifact:twinned_administrative_body","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"twinned_administrative_body","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wikifact:work_location","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"work_location","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikitext:2","suite":["lighteval"],"prompt_function":"wikitext","hf_repo":"wikitext","hf_subset":"wikitext-2-raw-v1","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikitext:103:document_level","suite":["harness"],"prompt_function":"wikitext_harness","hf_repo":"EleutherAI\/wikitext_document_level","hf_subset":"wikitext-103-raw-v1","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wikitext:103:document_level","suite":["helm"],"prompt_function":"wikitext_helm","hf_repo":"EleutherAI\/wikitext_document_level","hf_subset":"wikitext-103-raw-v1","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wino_x_german","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"wino_x_german","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"winogrande","suite":["leaderboard"],"prompt_function":"winogrande","hf_repo":"winogrande","hf_subset":"winogrande_xl","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"winowhy","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"winowhy","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt08:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wmt08:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt08:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt08:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt08:en-es","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_en-es","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt08:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt08:en-hu","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_en-hu","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt08:es-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_es-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt08:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wmt08:hu-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_hu-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt09:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt09:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt09:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt09:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt09:en-es","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_en-es","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt09:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt09:en-hu","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_en-hu","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wmt09:en-it","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_en-it","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt09:es-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_es-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt09:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt09:hu-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_hu-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt09:it-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_it-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt10:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt10:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt10:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wmt10:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt10:en-es","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_en-es","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt10:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt10:es-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_es-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt10:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt11:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt11:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt11:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wmt11:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt11:en-es","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_en-es","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt11:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt11:es-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_es-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt11:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt12:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt12:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt12:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wmt12:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt12:en-es","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_en-es","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt12:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt12:es-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_es-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt12:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt13:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt13:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt13:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wmt13:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt13:en-es","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_en-es","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt13:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt13:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt13:es-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_es-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt13:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt13:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt14:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wmt14:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt14:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt14:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt14:en-fr","suite":["lighteval","gpt3_benchmarks"],"prompt_function":"wmt_alphabetical","hf_repo":"wmt14","hf_subset":"fr-en","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt14:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt14:en-hi","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_en-hi","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt14:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt14:fr-en","suite":["lighteval","gpt3_benchmarks"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"wmt14","hf_subset":"fr-en","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wmt14:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt14:hi-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_hi-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt14:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt14:cs-en","suite":["helm"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/wmt14","hf_subset":"cs-en","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["bleu"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt14:de-en","suite":["helm"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/wmt14","hf_subset":"de-en","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["bleu"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt14:fr-en","suite":["helm"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/wmt14","hf_subset":"fr-en","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["bleu"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt14:hi-en","suite":["helm"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/wmt14","hf_subset":"hi-en","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["bleu"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt14:ru-en","suite":["helm"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/wmt14","hf_subset":"ru-en","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["bleu"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wmt15:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt15:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt15:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt15:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt15:en-fi","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_en-fi","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt15:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt15:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt15:fi-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_fi-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wmt15:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt15:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt16:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt16:de-en","suite":["lighteval","gpt3_benchmarks"],"prompt_function":"wmt_alphabetical","hf_repo":"wmt16","hf_subset":"de-en","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt16:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt16:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt16:en-de","suite":["lighteval","gpt3_benchmarks"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"wmt16","hf_subset":"de-en","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt16:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wmt16:en-fi","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_en-fi","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt16:en-ro","suite":["lighteval","gpt3_benchmarks"],"prompt_function":"wmt_alphabetical","hf_repo":"wmt16","hf_subset":"ro-en","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt16:en-ro","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_en-ro","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt16:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt16:en-tr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_en-tr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt16:fi-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_fi-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt16:ro-en","suite":["lighteval","gpt3_benchmarks"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"wmt16","hf_subset":"ro-en","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt16:ro-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_ro-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wmt16:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt16:tr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_tr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt17:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt17:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt17:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt17:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt17:en-fi","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_en-fi","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt17:en-lv","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_en-lv","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wmt17:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt17:en-tr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_en-tr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt17:en-zh","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_en-zh","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt17:fi-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_fi-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt17:lv-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_lv-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt17:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt17:tr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_tr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt17:zh-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_zh-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wmt18:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt18:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt18:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt18:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt18:en-et","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_en-et","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt18:en-fi","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_en-fi","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt18:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt18:en-tr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_en-tr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wmt18:en-zh","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_en-zh","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt18:et-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_et-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt18:fi-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_fi-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt18:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt18:tr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_tr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt18:zh-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_zh-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt19:cs-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_cs-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt19:de-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_de-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wmt19:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt19:de-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_de-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt19:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt19:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt19:en-fi","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-fi","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt19:en-gu","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-gu","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt19:en-kk","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-kk","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt19:en-lt","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-lt","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wmt19:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt19:en-zh","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-zh","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt19:fi-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_fi-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt19:fr-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_fr-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt19:gu-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_gu-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt19:kk-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_kk-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt19:lt-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_lt-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt19:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wmt19:zh-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_zh-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt20:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt20:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt20:de-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_de-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt20:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt20:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt20:en-iu","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-iu","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt20:en-ja","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-ja","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wmt20:en-km","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-km","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt20:en-pl","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-pl","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt20:en-ps","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-ps","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt20:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt20:en-ta","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-ta","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt20:en-zh","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-zh","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt20:fr-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_fr-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt20:iu-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_iu-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"wmt20:ja-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_ja-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt20:km-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_km-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt20:pl-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_pl-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt20:ps-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_ps-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt20:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt20:ta-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_ta-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wmt20:zh-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_zh-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"word_sorting","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"word_sorting","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"word_unscrambling","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"word_unscrambling","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"wsc273","suite":["lighteval"],"prompt_function":"wsc273","hf_repo":"winograd_wsc","hf_subset":"wsc273","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"xcopa:en","suite":["lighteval"],"prompt_function":"xcopa_en","hf_repo":"xcopa","hf_subset":"default","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"xcopa:et","suite":["lighteval"],"prompt_function":"xcopa_et","hf_repo":"xcopa","hf_subset":"et","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"xcopa:ht","suite":["lighteval"],"prompt_function":"xcopa_ht","hf_repo":"xcopa","hf_subset":"ht","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"xcopa:it","suite":["lighteval"],"prompt_function":"xcopa_it","hf_repo":"xcopa","hf_subset":"it","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"xcopa:id","suite":["lighteval"],"prompt_function":"xcopa_id","hf_repo":"xcopa","hf_subset":"id","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"xcopa:qu","suite":["lighteval"],"prompt_function":"xcopa_qu","hf_repo":"xcopa","hf_subset":"qu","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"xcopa:sw","suite":["lighteval"],"prompt_function":"xcopa_sw","hf_repo":"xcopa","hf_subset":"sw","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"xcopa:zh","suite":["lighteval"],"prompt_function":"xcopa_zh","hf_repo":"xcopa","hf_subset":"zh","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"xcopa:ta","suite":["lighteval"],"prompt_function":"xcopa_ta","hf_repo":"xcopa","hf_subset":"ta","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"xcopa:th","suite":["lighteval"],"prompt_function":"xcopa_th","hf_repo":"xcopa","hf_subset":"th","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"xcopa:tr","suite":["lighteval"],"prompt_function":"xcopa_tr","hf_repo":"xcopa","hf_subset":"tr","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"xcopa:vi","suite":["lighteval"],"prompt_function":"xcopa_vi","hf_repo":"xcopa","hf_subset":"vi","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"xstory_cloze:en","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"en","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"xstory_cloze:ru","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"ru","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"xstory_cloze:zh","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"zh","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"xstory_cloze:es","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"es","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"xstory_cloze:ar","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"ar","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"xstory_cloze:hi","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"hi","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"xstory_cloze:id","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"id","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"xstory_cloze:te","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"te","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"xstory_cloze:sw","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"sw","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"xstory_cloze:eu","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"eu","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"xstory_cloze:my","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"my","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"xwinograd:en","suite":["lighteval"],"prompt_function":"winogrande","hf_repo":"Muennighoff/xwinograd","hf_subset":"en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"xwinograd:fr","suite":["lighteval"],"prompt_function":"winogrande","hf_repo":"Muennighoff/xwinograd","hf_subset":"fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"xwinograd:jp","suite":["lighteval"],"prompt_function":"winogrande","hf_repo":"Muennighoff/xwinograd","hf_subset":"jp","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"xwinograd:pt","suite":["lighteval"],"prompt_function":"winogrande","hf_repo":"Muennighoff/xwinograd","hf_subset":"pt","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"xwinograd:ru","suite":["lighteval"],"prompt_function":"winogrande","hf_repo":"Muennighoff/xwinograd","hf_subset":"ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"xwinograd:zh","suite":["lighteval"],"prompt_function":"winogrande","hf_repo":"Muennighoff/xwinograd","hf_subset":"zh","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} From 9e3078eb14117da4b8c7508738177d21403d11e8 Mon Sep 17 00:00:00 2001 From: Nathan Habib <30601243+NathanHB@users.noreply.github.com> Date: Fri, 3 May 2024 15:07:04 +0200 Subject: [PATCH 08/25] adding aimo custom eval (#154) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --------- Co-authored-by: Nathan Habib Co-authored-by: lewtun Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> --- community_tasks/aimo_evals.py | 72 +++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 community_tasks/aimo_evals.py diff --git a/community_tasks/aimo_evals.py b/community_tasks/aimo_evals.py new file mode 100644 index 00000000..05704ca4 --- /dev/null +++ b/community_tasks/aimo_evals.py @@ -0,0 +1,72 @@ +# MIT License + +# Copyright (c) 2024 The HuggingFace Team +# Copyright (c) 2024 Philip May, Deutsche Telekom AG + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +# ruff: noqa: F405, F403, F401 +""" +Custom evaluation tasks for lighteval. + +This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval. +This module implements the ... +""" + +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc + + +task = LightevalTaskConfig( + name="aimo_progress_prize_1", + prompt_function="prompt", + suite=["community"], + hf_subset="", + hf_repo="lighteval/aimo_progress_prize_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split="train", + few_shots_select="sequential", + metric=["quasi_exact_match_math"], + generation_size=2048, + stop_sequence=None, +) + + +def aimo_prompt(line, task_name: str = None): + return Doc( + task_name=task_name, + choices=[str(line["answer"])], + gold_index=0, + query=line["problem"], + ) + + +# STORE YOUR EVALS +_TASKS = [task] + + +# MODULE LOGIC +# You should not need to touch this +# Convert to dict for lighteval +TASKS_TABLE = [task.as_dict() for task in _TASKS] + +if __name__ == "__main__": + print(t["name"] for t in TASKS_TABLE) + print(len(TASKS_TABLE)) From 60438f3cf2670969e5f8306967db51831f27eb6c Mon Sep 17 00:00:00 2001 From: lewtun Date: Sat, 4 May 2024 13:05:12 +0200 Subject: [PATCH 09/25] [MATH] Fix generation for chat models & fix normalization for predictions (#163) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix _fix_fracs in MATH normalization * Fix generation for chat models * Use same normaliser for predictions and golds * better stop token managment when using chat templates * fix test, separate math and math cot --------- Co-authored-by: Nathan Habib Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> Co-authored-by: clementine@huggingface.co --- src/lighteval/metrics/metrics.py | 5 +- src/lighteval/metrics/normalizations.py | 54 +++++++++++++------ src/lighteval/models/base_model.py | 20 +++++-- src/lighteval/models/model_config.py | 3 ++ .../tasks/tasks_prompt_formatting.py | 9 ++++ src/lighteval/tasks/tasks_table.jsonl | 21 +++++--- tests/reference_scores/harness_metrics.json | 2 +- 7 files changed, 81 insertions(+), 33 deletions(-) diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index 07d5c918..f7eaedba 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -54,7 +54,6 @@ harness_triviaqa_normalizer, helm_normalizer, math_normalizer, - math_normalizer_gold, remove_braces, remove_braces_and_strip, ) @@ -330,7 +329,7 @@ class Metrics(Enum): maj_at_4_math = SampleLevelMetric( metric="maj@4", sample_level_fn=MajAtK( - k=4, strip_strings=True, normalize_pred=math_normalizer, normalize_gold=math_normalizer_gold + k=4, strip_strings=True, normalize_pred=math_normalizer, normalize_gold=math_normalizer ).compute, category=MetricCategory.GENERATIVE_SAMPLING, use_case=MetricUseCase.MATH, @@ -438,7 +437,7 @@ class Metrics(Enum): quasi_exact_match_math = SampleLevelMetric( metric="qem", sample_level_fn=ExactMatches( - strip_strings=True, normalize_pred=math_normalizer, normalize_gold=math_normalizer_gold + strip_strings=True, normalize_pred=math_normalizer, normalize_gold=math_normalizer ).compute, category=MetricCategory.GENERATIVE, use_case=MetricUseCase.MATH, diff --git a/src/lighteval/metrics/normalizations.py b/src/lighteval/metrics/normalizations.py index b5b32d3e..d3f33d84 100644 --- a/src/lighteval/metrics/normalizations.py +++ 
b/src/lighteval/metrics/normalizations.py @@ -85,16 +85,18 @@ def remove_braces_and_strip(text: str) -> str: return text -def math_normalizer(text: str, is_gold: bool = False) -> str: # noqa C901 +def math_normalizer(text: str) -> str: # noqa C901 """Source: https://github.com/hendrycks/math""" - def _remove_boxed(text: str) -> str: + def _remove_boxed(text: str | None) -> str: """ Extract the text within a \\boxed{...} environment. Example: >>> _remove_boxed(\\boxed{\\frac{2}{3}}) \\frac{2}{3} """ + if text is None: + return "" if "\\boxed " in text: left = "\\boxed " assert text[: len(left)] == left @@ -205,14 +207,41 @@ def _fix_a_slash_b(text: str) -> str: return text def _remove_right_units(text: str) -> str: - """Source: https://github.com/hendrycks/math - Remove units (on the right). - "\\text{ " only ever occurs (at least in the val set) when describing units. """ + Removes unit descriptions from LaTeX-formatted text, where units are indicated by "\\text{ }". + This function splits the text at each "\\text{ " and returns the part before the first occurrence, + effectively discarding any units and additional text following this pattern. This function also + trims any trailing whitespace left after removing units. + + Args: + text (str): The input string potentially containing LaTeX-style unit descriptions. + + Returns: + str: The text with unit descriptions removed. + + Examples: + - Input: '50.5 \\text{ kg}' + Output: '50.5' + + - Input: 'The mass is 20 grams' + Output: 'The mass is 20 grams' + + - Input: 'The object weighs 30.2 \\text{ lbs} and is 15 \\text{ inches} long' + Output: 'The object weighs 30.2' + + - Input: '\\text{ unit without preceding text}' + Output: '' + + Note: + This function assumes that "\\text{ " is only used to denote units. It will remove all text + following the first occurrence of "\\text{ ", including any further text and units that might + appear in complex sentences. 
+ """ + # Check for "\\text{ " and split the text at each occurrence if "\\text{ " in text: splits = text.split("\\text{ ") - assert len(splits) == 2 - return splits[0] + # Return only the first part which is assumed to contain the main content without units + return splits[0].rstrip() else: return text @@ -236,12 +265,7 @@ def _fix_sqrt(text: str) -> str: new_string += new_substr return new_string - if is_gold: - text = _remove_boxed(_last_boxed_only_string(text)) - else: - indices = [pos for pos, char in enumerate(text) if char == "$"] - if len(indices) > 1: - text = text[indices[0] + 1 : indices[-1]] + text = _remove_boxed(_last_boxed_only_string(text)) to_replace_1 = [ ("\n", ""), # linebreaks @@ -304,10 +328,6 @@ def _fix_sqrt(text: str) -> str: return text -def math_normalizer_gold(text: str) -> str: - return math_normalizer(text, True) - - def gsm8k_normalizer(text: str) -> str: """ from https://github.com/openai/grade-school-math/blob/3101c7d5072418e28b9008a6636bde82a006892c/grade_school_math/dataset.py#L28 diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index 7d9bd8d2..3913fd80 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -74,6 +74,7 @@ def __init__( self.accelerator = config.accelerator self._batch_size = config.batch_size self._max_length = self._init_max_length(config.max_length) + self.use_chat_template = config.use_chat_template self._add_special_tokens = config.add_special_tokens if config.add_special_tokens is not None else False self._tokenizer = self._create_auto_tokenizer(config, env_config) @@ -346,7 +347,11 @@ def greedy_until_multi_turn( # noqa: C901 dataloader, desc="Greedy Multi Turn generation", position=1, leave=False, disable=self.disable_tqdm ): request = request_batch[0] - stop_tokens = request.stop_sequence + # For chat models, generation stops with EOS token, so we don't need to specify stop tokens + if self.use_chat_template: + stop_tokens = [] + else: + stop_tokens = request.stop_sequence max_generated_tokens = request.generation_size context = request.context[0] max_context_size_allowed = self.max_length - max_generated_tokens @@ -512,10 +517,15 @@ def greedy_until( for batch in tqdm( dataloader, desc="Greedy generation", position=1, leave=False, disable=self.disable_tqdm ): - # NOTE: we are assuming all items in a batch behave similarly (same - # stop_tokens and max_tokens generated) which is not necessarily - # the case! Because of that we only use batch size of 1 - stop_tokens = batch[0].stop_sequence + # For chat models, generation stops with EOS token, so we don't need to specify stop tokens + if self.use_chat_template: + stop_tokens = [] + else: + # NOTE: we are assuming all items in a batch behave similarly (same + # stop_tokens and max_tokens generated) which is not necessarily + # the case!
Because of that we only use batch size of 1 + stop_tokens = batch[0].stop_sequence + max_new_tokens = batch[0].generation_size returns_logits = batch[0].use_logits num_samples = batch[0].num_samples diff --git a/src/lighteval/models/model_config.py b/src/lighteval/models/model_config.py index 55ac1045..d62a85d3 100644 --- a/src/lighteval/models/model_config.py +++ b/src/lighteval/models/model_config.py @@ -122,6 +122,7 @@ class BaseModelConfig: device: Union[int, str] = "cuda" quantization_config: Optional[BitsAndBytesConfig] = None trust_remote_code: bool = False + use_chat_template: bool = False def __post_init__(self): if self.quantization_config is not None and not is_bnb_available(): @@ -266,6 +267,7 @@ def create_model_config(args: Namespace, accelerator: Union["Accelerator", None] if args.model_args: args_dict = {k.split("=")[0]: k.split("=")[1] for k in args.model_args.split(",")} args_dict["accelerator"] = accelerator + args_dict["use_chat_template"] = args.use_chat_template return BaseModelConfig(**args_dict) @@ -333,6 +335,7 @@ def create_model_config(args: Namespace, accelerator: Union["Accelerator", None] args_dict["quantization_config"] = quantization_config args_dict["batch_size"] = args.override_batch_size args_dict["multichoice_continuations_start_space"] = multichoice_continuations_start_space + args_dict["use_chat_template"] = args.use_chat_template # Keeping only non null params args_dict = {k: v for k, v in args_dict.items() if v is not None} diff --git a/src/lighteval/tasks/tasks_prompt_formatting.py b/src/lighteval/tasks/tasks_prompt_formatting.py index 2092bd24..80ce5588 100644 --- a/src/lighteval/tasks/tasks_prompt_formatting.py +++ b/src/lighteval/tasks/tasks_prompt_formatting.py @@ -1239,6 +1239,15 @@ def math(line, task_name: str = None): ) +def math_cot(line, task_name: str = None): + return Doc( + task_name=task_name, + query=f"{line['problem']}\nPlease reason step by step, and put your final answer within \\boxed{{}}.", + gold_index=0, + choices=[f" {line['solution']}"], + ) + + def math_helm(line, task_name: str = None): return Doc( task_name=task_name, diff --git a/src/lighteval/tasks/tasks_table.jsonl b/src/lighteval/tasks/tasks_table.jsonl index 83cbbbd1..0047ad5d 100644 --- a/src/lighteval/tasks/tasks_table.jsonl +++ b/src/lighteval/tasks/tasks_table.jsonl @@ -540,13 +540,20 @@ {"name":"lsat_qa:grouping","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"grouping","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} {"name":"lsat_qa:miscellaneous","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"miscellaneous","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
{"name":"lsat_qa:ordering","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"ordering","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"math:algebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"algebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"math:counting_and_probability","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"counting_and_probability","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"math:geometry","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"geometry","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"math:intermediate_algebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"intermediate_algebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"math:number_theory","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"number_theory","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"math:prealgebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"prealgebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"math:precalculus","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"precalculus","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"math:algebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"algebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":1} +{"name":"math:counting_and_probability","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"counting_and_probability","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":1} +{"name":"math:geometry","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"geometry","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":1} +{"name":"math:intermediate_algebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"intermediate_algebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":1} +{"name":"math:number_theory","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"number_theory","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":1} +{"name":"math:prealgebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"prealgebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":1} +{"name":"math:precalculus","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"precalculus","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":1} +{"name":"math_cot:algebra","suite":["lighteval","math"],"prompt_function":"math_cot","hf_repo":"lighteval\/MATH","hf_subset":"algebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
+{"name":"math_cot:counting_and_probability","suite":["lighteval","math"],"prompt_function":"math_cot","hf_repo":"lighteval\/MATH","hf_subset":"counting_and_probability","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"math_cot:geometry","suite":["lighteval","math"],"prompt_function":"math_cot","hf_repo":"lighteval\/MATH","hf_subset":"geometry","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"math_cot:intermediate_algebra","suite":["lighteval","math"],"prompt_function":"math_cot","hf_repo":"lighteval\/MATH","hf_subset":"intermediate_algebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"math_cot:number_theory","suite":["lighteval","math"],"prompt_function":"math_cot","hf_repo":"lighteval\/MATH","hf_subset":"number_theory","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"math_cot:prealgebra","suite":["lighteval","math"],"prompt_function":"math_cot","hf_repo":"lighteval\/MATH","hf_subset":"prealgebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} +{"name":"math_cot:precalculus","suite":["lighteval","math"],"prompt_function":"math_cot","hf_repo":"lighteval\/MATH","hf_subset":"precalculus","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} {"name":"mathematical_induction","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"mathematical_induction","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} {"name":"mathqa","suite":["lighteval"],"prompt_function":"mathqa","hf_repo":"math_qa","hf_subset":"default","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
{"name":"matrixshapes","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"matrixshapes","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} diff --git a/tests/reference_scores/harness_metrics.json b/tests/reference_scores/harness_metrics.json index 1c8c5b91..9679e659 100644 --- a/tests/reference_scores/harness_metrics.json +++ b/tests/reference_scores/harness_metrics.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:408956938a6b7a18b03658bb9772b471efcea4aa04afb0b35d76cecfca6a706e +oid sha256:e5dffe1e990e1e839322b74ff02f306ea468ad7602492f62f987cae1bb546b84 size 48376580 From 981e10ac233b2940cac5a4aabdbba36537986c61 Mon Sep 17 00:00:00 2001 From: lewtun Date: Tue, 7 May 2024 14:23:29 +0200 Subject: [PATCH 10/25] Fix AIMO (#186) Small bug fix and added a doc string --- community_tasks/aimo_evals.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/community_tasks/aimo_evals.py b/community_tasks/aimo_evals.py index 05704ca4..556ae663 100644 --- a/community_tasks/aimo_evals.py +++ b/community_tasks/aimo_evals.py @@ -1,7 +1,6 @@ # MIT License # Copyright (c) 2024 The HuggingFace Team -# Copyright (c) 2024 Philip May, Deutsche Telekom AG # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -23,10 +22,7 @@ # ruff: noqa: F405, F403, F401 """ -Custom evaluation tasks for lighteval. - -This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval. -This module implements the ... 
+Task to evaluate LLMs on the training set of the Kaggle AIMO competition: https://www.kaggle.com/competitions/ai-mathematical-olympiad-prize """ from lighteval.tasks.lighteval_task import LightevalTaskConfig @@ -35,7 +31,7 @@ task = LightevalTaskConfig( name="aimo_progress_prize_1", - prompt_function="prompt", + prompt_function="aimo_prompt", suite=["community"], hf_subset="", hf_repo="lighteval/aimo_progress_prize_1", From 061283f50ef89624445ebe51eb8f9824593c2341 Mon Sep 17 00:00:00 2001 From: Sadra Barikbin Date: Sun, 12 May 2024 14:04:32 +0330 Subject: [PATCH 11/25] Fix a few comment and docstring typos and a typehint (#177) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --------- Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> --- community_tasks/_template.py | 2 +- community_tasks/arabic_evals.py | 2 +- community_tasks/german_rag_evals.py | 2 +- examples/nanotron/custom_evaluation_tasks.py | 2 +- src/lighteval/metrics/metrics_sample.py | 14 +++++++------- src/lighteval/metrics/sample_preparator.py | 2 +- src/lighteval/models/abstract_model.py | 2 +- src/lighteval/models/model_config.py | 2 +- src/lighteval/tasks/requests.py | 2 +- 9 files changed, 15 insertions(+), 15 deletions(-) diff --git a/community_tasks/_template.py b/community_tasks/_template.py index 93934648..6b52f9f4 100644 --- a/community_tasks/_template.py +++ b/community_tasks/_template.py @@ -40,7 +40,7 @@ # EVAL WITH NO SUBSET ## -# This is how you create a simple tasks (like hellaswag) which has one single subset +# This is how you create a simple task (like hellaswag) which has one single subset # attached to it, and one evaluation possible. task = LightevalTaskConfig( name="myothertask", diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index 6a0af9f5..9e65bade 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -24,7 +24,7 @@ """ Custom evaluation tasks for lighteval -This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval. +This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval. """ import random import re diff --git a/community_tasks/german_rag_evals.py b/community_tasks/german_rag_evals.py index 687eda0c..fdda9d7a 100644 --- a/community_tasks/german_rag_evals.py +++ b/community_tasks/german_rag_evals.py @@ -25,7 +25,7 @@ """ Custom evaluation tasks for lighteval. -This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval. +This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval. This module implements the 4 tasks of deutsche-telekom/Ger-RAG-eval. See: https://huggingface.co/datasets/deutsche-telekom/Ger-RAG-eval """ diff --git a/examples/nanotron/custom_evaluation_tasks.py b/examples/nanotron/custom_evaluation_tasks.py index 4c0f3c85..cdca8385 100644 --- a/examples/nanotron/custom_evaluation_tasks.py +++ b/examples/nanotron/custom_evaluation_tasks.py @@ -24,7 +24,7 @@ """ Custom evaluation tasks for lighteval -This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval. +This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval. 
""" import re from dataclasses import asdict diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 37b922da..6e983247 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -213,21 +213,21 @@ def __init__(self, length_normalization: bool = False, ignore_first_space: bool length_normalization (bool, optional): Whether log-likelihood scores should be normalized for sentence length. Defaults to False. Should be True for most cases. ignore_first_space (bool, optional): Whether to ignore the first token's log prob (if it's a space only). Defaults to False. - Only case when it should be True is when the possible choices (for example `A`,`B` ...) have an extra + The only case when it should be True is when the possible choices (for example `A`,`B` ...) have an extra space added in front of them to manage tokenization issues (` A`, ` B`, ...) for some models. """ self.length_normalization = length_normalization self.ignore_first_space = ignore_first_space def compute(self, gold_ixs: list[int], choices_logprob: list[float], formatted_doc: Doc, **kwargs) -> int: - """Computs the log likelihood accuracy: is the choice with the highest logprob in `choices_logprob` present - in the `gold_idxs`? + """Computes the log likelihood accuracy: is the choice with the highest logprob in `choices_logprob` present + in the `gold_ixs`? Args: gold_ixs (list[int]): All the gold choices indices choices_logprob (list[float]): Summed log-probabilities of all the possible choices for the model, ordered as the choices. formatted_doc (Doc): Original document for the sample. - Used to get the original choices's length for possible normalisation + Used to get the original choices' length for possible normalization Returns: int: The eval score: 1 if the best log-prob choice is in gold, 0 otherwise. @@ -258,7 +258,7 @@ def __init__(self, at: int) -> None: def compute(self, choices_logprob: list[float], gold_ixs: list[int], **kwargs) -> int: """Computes the recall at the requested depth level: looks at the `n` best predicted choices (with the - highest log probabilies) and see if there is an actual gold among them. + highest log probabilities) and see if there is an actual gold among them. Args: gold_ixs (list[int]): All the gold choices indices @@ -277,7 +277,7 @@ def __init__(self, length_normalization: bool = False): """A mean reciprocal rank class. Args: - length_normalization (bool, optional): Whether to use normalisation be choice length when computing the best log-probabilities. Defaults to False. + length_normalization (bool, optional): Whether to use normalization on choice length when computing the best log-probabilities. Defaults to False. """ self.length_normalization = length_normalization @@ -288,7 +288,7 @@ def compute(self, choices_logprob: list[float], gold_ixs: list[float], formatted gold_ixs (list[int]): All the gold choices indices choices_logprob (list[float]): Summed log-probabilities of all the possible choices for the model, ordered as the choices. formatted_doc (Doc): Original document for the sample. - Used to get the original choices's length for possible normalisation + Used to get the original choices' length for possible normalization Returns: float: MRR score. 
diff --git a/src/lighteval/metrics/sample_preparator.py b/src/lighteval/metrics/sample_preparator.py index 0df23ec3..a7d7d0c0 100644 --- a/src/lighteval/metrics/sample_preparator.py +++ b/src/lighteval/metrics/sample_preparator.py @@ -132,7 +132,7 @@ def prepare(self, logprobs: list[float] | float, reference_text: str, **kwargs): """Prepares an individual perplexity example to the format expected by metrics computed at the corpus level (aggregated). Args: - logprobs (list[float]): List of the logprobabilities computed for each item of the sequence or single aggregated logprob over the sequence + logprobs (list[float]): List of the log-probabilities computed for each item of the sequence or single aggregated logprob over the sequence reference_text (str): Current reference text for which to compute the length in self.units_type Returns: diff --git a/src/lighteval/models/abstract_model.py b/src/lighteval/models/abstract_model.py index 754a6144..b9111c31 100644 --- a/src/lighteval/models/abstract_model.py +++ b/src/lighteval/models/abstract_model.py @@ -118,7 +118,7 @@ def loglikelihood( @abstractmethod def loglikelihood_rolling( - self, requests: list[LoglikelihoodRollingRequest], override_bs=None + self, requests: list[LoglikelihoodRollingRequest], override_bs: Optional[int] = None ) -> list[LoglikelihoodReturn]: """This function is used to compute the log likelihood of the context for perplexity metrics.""" return NotImplemented diff --git a/src/lighteval/models/model_config.py b/src/lighteval/models/model_config.py index d62a85d3..5cb7c89d 100644 --- a/src/lighteval/models/model_config.py +++ b/src/lighteval/models/model_config.py @@ -349,7 +349,7 @@ def create_model_config(args: Namespace, accelerator: Union["Accelerator", None] raise ValueError("You need to specify a base model when using adapter weights") return AdapterModelConfig(**args_dict) if config["merged_weights"]["base_model"] not in ["", None]: - raise ValueError("You can't specifify a base model if you are not using delta/adapter weights") + raise ValueError("You can't specify a base model if you are not using delta/adapter weights") return BaseModelConfig(**args_dict) raise ValueError(f"Unknown model type in your model config file: {config['type']}") diff --git a/src/lighteval/tasks/requests.py b/src/lighteval/tasks/requests.py index 51abf61d..283e6959 100644 --- a/src/lighteval/tasks/requests.py +++ b/src/lighteval/tasks/requests.py @@ -155,7 +155,7 @@ class TaskExampleId(NamedTuple): class Doc: """ Dataclass used to represent the content of a task example - almost every field is optional, but some tasks require some fields to be present + almost every field is optional, but some tasks require some fields to be present. When adding a new task, please add the required fields to the doc class. Each task will have a different set of fields needed. """ From ad42e43bcc3bd50fdba68936999bf553bf53b9e4 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Mon, 13 May 2024 18:02:14 +0200 Subject: [PATCH 12/25] fix typos (#189) --- README.md | 54 +++++++++---------- src/lighteval/metrics/metrics_sample.py | 7 +-- .../extended/mt_bench/judge_prompts.jsonl | 2 +- 3 files changed, 31 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index fe9748c3..6b116fc7 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ LightEval is a lightweight LLM evaluation suite that Hugging Face has been using We're releasing it with the community in the spirit of building in the open. 
Note that it is still very much early so don't expect 100% stability ^^' -In case of problems or question, feel free to open an issue! +In case of problems or questions, feel free to open an issue! ## Installation @@ -130,7 +130,7 @@ See the [`examples/tasks/recommended_set.txt`](./examples/tasks/recommended_set. ### Evaluating a model with a complex configuration -If you want to evaluate a model by spinning up inference endpoints, or use adapter/delta weights, or more complex configuration options, you can load models using a configuration file. This is done as follows: +If you want to evaluate a model by spinning up inference endpoints, use adapter/delta weights, or more complex configuration options, you can load models using a configuration file. This is done as follows: ```shell accelerate launch --multi_gpu --num_processes= run_evals_accelerate.py \ @@ -186,12 +186,12 @@ python run_evals_accelerate.py \ Independently of the default tasks provided in `lighteval` that you will find in the `tasks_table.jsonl` file, you can use `lighteval` to evaluate models on tasks that require special processing (or have been added by the community). These tasks have their own evaluation suites and are defined as follows: -* `extended`: tasks which have complex pre- or post-processing and are added by the `lighteval` maintainers. See the [`extended`](./src/lighteval/tasks/extended) folder for examples. -* `community`: tasks which have been added by the community. See the [`community_tasks`](./community_tasks) folder for examples. -* `custom`: tasks which are defined locally and not present in the core library. Use this suite if you want to experiment with designing a special metric or task. +* `extended`: tasks that have complex pre- or post-processing and are added by the `lighteval` maintainers. See the [`extended`](./src/lighteval/tasks/extended) folder for examples. +* `community`: tasks that have been added by the community. See the [`community_tasks`](./community_tasks) folder for examples. +* `custom`: tasks that are defined locally and not present in the core library. Use this suite if you want to experiment with designing a special metric or task. -For example, to run an extended task like ifeval, you can run: +For example, to run an extended task like `ifeval`, you can run: ```shell python run_evals_accelerate.py \ --model_args "pretrained=HuggingFaceH4/zephyr-7b-beta" \ @@ -221,7 +221,6 @@ python run_evals_accelerate.py \ --output_dir "./evals" ``` - ## Deep thanks `lighteval` was originally built on top of the great [Eleuther AI Harness](https://github.com/EleutherAI/lm-evaluation-harness) (we use the latter to power the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)). We also took a lot of inspiration from the amazing [HELM](https://crfm.stanford.edu/helm/latest/), notably for metrics. 
@@ -236,30 +235,30 @@ However, we are very grateful to the Harness and HELM teams for their continued - [lighteval](https://github.com/huggingface/lighteval/tree/main/src/lighteval) contains the core of the library, divided in the following section - [main_accelerate.py](https://github.com/huggingface/lighteval/blob/main/src/lighteval/main_accelerate.py) and [main_nanotron.py](https://github.com/huggingface/lighteval/blob/main/src/lighteval/main_nanotron.py) are our entry points to run evaluation - [logging](https://github.com/huggingface/lighteval/tree/main/src/lighteval/logging): Our loggers, to display experiment information and push it to the hub after a run - - [metrics](https://github.com/huggingface/lighteval/tree/main/src/lighteval/metrics): All the available metrics you can use. They are described in metrics, and divided between sample metrics (applied at the sample level, such as a prediction accuracy) and corpus metrics (applied over the whole corpus). You'll also find available normalisation functions. + - [metrics](https://github.com/huggingface/lighteval/tree/main/src/lighteval/metrics): All the available metrics you can use. They are described in metrics, and divided between sample metrics (applied at the sample level, such as prediction accuracy) and corpus metrics (applied over the whole corpus). You'll also find available normalisation functions. - [models](https://github.com/huggingface/lighteval/tree/main/src/lighteval/models): Possible models to use. We cover transformers (base_model), with adapter or delta weights, as well as TGI models locally deployed (it's likely the code here is out of date though), and brrr/nanotron models. - [tasks](https://github.com/huggingface/lighteval/tree/main/src/lighteval/tasks): Available tasks. The complete list is in `tasks_table.jsonl`, and you'll find all the prompts in `tasks_prompt_formatting.py`. Popular tasks requiring custom logic are exceptionally added in the [extended tasks](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/extended). - [examples/tasks](https://github.com/huggingface/lighteval/tree/main/examples/tasks) contains a list of available tasks you can launch. We advise using tasks in the `recommended_set`, as it's possible that some of the other tasks need double checking. -- [tests](https://github.com/huggingface/lighteval/tree/main/tests) contains our test suite, that we run at each PR to prevent regressions in metrics/prompts/tasks, for a subset of important tasks. +- [tests](https://github.com/huggingface/lighteval/tree/main/tests) contains our test suite, which we run at each PR to prevent regressions in metrics/prompts/tasks, for a subset of important tasks. -## Customisation +## Customization If your new task or metric has requirements, add a specific `requirements.txt` file with your evaluation. ### Adding a new task -To add a new task, first either open an issue, to determine whether it will be integrated in the core evaluations of lighteval, in the extended tasks, or in the community tasks, and **add its dataset** on the hub. +To add a new task, first either open an issue, to determine whether it will be integrated in the core evaluations of lighteval, in the extended tasks, or the community tasks, and **add its dataset** on the hub. -- Core evaluations are evaluation which only require standard logic in their metrics and processing, and that we will add to our test suite to ensure non regression through time. They already see a high usage in the community. 
-- Extended evaluations are evaluations which require custom logic in their metrics (complex normalisation, an LLM as a judge, ...), that we added to facilitate the life of users. They already see a high usage in the community. +- Core evaluations are evaluations that only require standard logic in their metrics and processing, and that we will add to our test suite to ensure non regression through time. They already see high usage in the community. +- Extended evaluations are evaluations that require custom logic in their metrics (complex normalisation, an LLM as a judge, ...), that we added to facilitate the life of users. They already see high usage in the community. - Community evaluations are submissions by the community of new tasks. -A popular community evaluation can move to becoming an extended or core evaluation through time. +A popular community evaluation can move to become an extended or core evaluation over time. #### Core evaluations -Prompt function: **find a suitable prompt function** in `src.lighteval.tasks.task_prompt_formatting.py`, or code your own. This function must output a `Doc` object, which should contain `query`, your prompt, and either `gold`, the gold output, or `choices` and `gold_index`, the list of choices and index or indices of correct answers. If your query contains an instruction which should not be repeated in a few shot setup, add it to an `instruction` field. +Prompt function: **find a suitable prompt function** in `src.lighteval.tasks.task_prompt_formatting.py`, or code your own. This function must output a `Doc` object, which should contain the `query`, your prompt, and either `gold`, the gold output, or `choices` and `gold_index`, the list of choices and index or indices of correct answers. If your query contains an instruction that should not be repeated in a few shot setup, add it to an `instruction` field. Summary: create a **line summary** of your evaluation, in `src/lighteval/tasks/tasks_table.jsonl`. This summary should contain the following fields: - `name` (str), your evaluation name -- `suite` (list), the suite(s) to which your evaluation should belong. This field allows us to compare different tasks implementation, and is used a task selection to differentiate the versions to launch. At the moment, you'll find the keywords ["helm", "bigbench", "original", "lighteval", "community", "custom"]; for core evals, please choose `lighteval`. +- `suite` (list), the suite(s) to which your evaluation should belong. This field allows us to compare different task implementations and is used as a task selection to differentiate the versions to launch. At the moment, you'll find the keywords ["helm", "bigbench", "original", "lighteval", "community", "custom"]; for core evals, please choose `lighteval`. - `prompt_function` (str), the name of the prompt function you defined in the step above - `hf_repo` (str), the path to your evaluation dataset on the hub - `hf_subset` (str), the specific subset you want to use for your evaluation (note: when the dataset has no subset, fill this field with `"default"`, not with `None` or `""`) @@ -267,16 +266,16 @@ Summary: create a **line summary** of your evaluation, in `src/lighteval/tasks/t - `evaluation_splits` (list), the splits you want to use for evaluation - `few_shots_split` (str, can be `null`), the specific split from which you want to select samples for your few-shot examples. 
It should be different from the sets included in `evaluation_splits` - `few_shots_select` (str, can be `null`), the method that you will use to select items for your few-shot examples. Can be `null`, or one of: - - `balanced` selects examples from the `few_shots_split` with balanced labels, to avoid skewing the few shot examples (hence the model generations) towards one specific label + - `balanced` select examples from the `few_shots_split` with balanced labels, to avoid skewing the few shot examples (hence the model generations) toward one specific label - `random` selects examples at random from the `few_shots_split` - `random_sampling` selects new examples at random from the `few_shots_split` for every new item, but if a sampled item is equal to the current one, it is removed from the available samples - `random_sampling_from_train` selects new examples at random from the `few_shots_split` for every new item, but if a sampled item is equal to the current one, it is kept! Only use this if you know what you are doing. - - `sequential` selects the first `n` examples of the `few_shots_split` +`sequential` selects the first `n` examples of the `few_shots_split` - `generation_size` (int), the maximum number of tokens allowed for a generative evaluation. If your evaluation is a log likelihood evaluation (multi-choice), this value should be -1 - `stop_sequence` (list), a list of strings acting as end of sentence tokens for your generation - `metric` (list), the metrics you want to use for your evaluation (see next section for a detailed explanation) -- `output_regex` (str), A regex string that will be used to filter your generation. (Genrative metrics will only select tokens that are between the first and the second sequence matched by the regex. For example, for a regex matching `\n` and a generation `\nModel generation output\nSome other text` the metric will only be fed with `Model generation output`) -- `frozen` (bool), for now is set to False, but we will steadily pass all stable tasks to True. +- `output_regex` (str), A regex string that will be used to filter your generation. (Generative metrics will only select tokens that are between the first and the second sequence matched by the regex. For example, for a regex matching `\n` and a generation `\nModel generation output\nSome other text` the metric will only be fed with `Model generation output`) +- `frozen` (bool), for now, is set to False, but we will steadily pass all stable tasks to True. - `trust_dataset` (bool), set to True if you trust the dataset. Make sure you can launch your model with your new task using `--tasks lighteval|yournewtask|2|0`. @@ -287,10 +286,10 @@ Copy the `community_tasks/_template.yml` to `community_tasks/yourevalname.py` an Make sure you can launch your model with your new task using `--tasks community|yournewtask|2|0 --custom_tasks community_tasks/yourevalname.py`. ### Adding a new metric -First check if you can use one of the parametrized functions in `src.lighteval.metrics.metrics_corpus` or `src.lighteval.metrics.metrics_sample`. +First, check if you can use one of the parametrized functions in `src.lighteval.metrics.metrics_corpus` or `src.lighteval.metrics.metrics_sample`. If not, you can use the custom_task system to register your new metric: -- create a new python file which should contain the full logic of your metric. +- create a new Python file which should contain the full logic of your metric. 
- the file also needs to start with these imports ```python from aenum import extend_enum @@ -319,15 +318,15 @@ These metrics use log-likelihood of the different possible targets. - `loglikelihood_acc_norm` (Harness): Fraction of instances where the choice with the best logprob, normalized by sequence length, was correct - also exists in a faster version for tasks where the possible choices include only one token (`loglikelihood_acc_norm_single_token`) - `loglikelihood_acc_norm_nospace` (Harness): Fraction of instances where the choice with the best logprob, normalized by sequence length, was correct, with the first space ignored - `loglikelihood_f1` (Harness): Corpus level F1 score of the multichoice selection - also exists in a faster version for tasks where the possible choices include only one token (`loglikelihood_f1_single_token`) -- `mcc` (Harness): Matthew's correlation coefficient (measure of agreement between statistical distributions), +- `mcc` (Harness): Matthew's correlation coefficient (a measure of agreement between statistical distributions), - `recall_at_1` (Harness): Fraction of instances where the choice with the best logprob was correct - also exists in a faster version for tasks where the possible choices include only one token per choice (`recall_at_1_single_token`) - `recall_at_2` (Harness): Fraction of instances where the choice with the 2nd best logprob or better was correct - also exists in a faster version for tasks where the possible choices include only one token per choice (`recall_at_2_single_token`) -- `mrr` (Harness): Mean reciprocal rank, measure of the quality of a ranking of choices ordered by correctness/relevance - also exists in a faster version for tasks where the possible choices include only one token (`mrr_single_token`) +- `mrr` (Harness): Mean reciprocal rank, a measure of the quality of a ranking of choices ordered by correctness/relevance - also exists in a faster version for tasks where the possible choices include only one token (`mrr_single_token`) - `target_perplexity` (Harness): Perplexity of the different choices available. - `acc_golds_likelihood`: (Harness): A bit different, it actually checks if the average logprob of a single target is above or below 0.5 - `multi_f1_numeric`: Loglikelihood F1 score for multiple gold targets -All these metrics also exist in a "single token" version (`loglikelihood_acc_single_token`, `loglikelihood_acc_norm_single_token`, `loglikelihood_f1_single_token`, `mcc_single_token`, `recall@2_single_token` and `mrr_single_token`). When the multichoice option compare only one token (ex: "A" vs "B" vs "C" vs "D", or "yes" vs "no"), using these metrics in the single token version will divide the time spent by the number of choices. Single token evals also include: +All these metrics also exist in a "single token" version (`loglikelihood_acc_single_token`, `loglikelihood_acc_norm_single_token`, `loglikelihood_f1_single_token`, `mcc_single_token`, `recall@2_single_token` and `mrr_single_token`). When the multichoice option compares only one token (ex: "A" vs "B" vs "C" vs "D", or "yes" vs "no"), using these metrics in the single token version will divide the time spent by the number of choices. Single token evals also include: - `multi_f1_numeric` (Harness, for CB): computes the f1 score of all possible choices and averages it. ### Metrics for perplexity and language modeling @@ -341,7 +340,7 @@ These metrics use log-likelihood of prompt. These metrics need the model to generate an output. They are therefore slower. 
- Base: - `perfect_exact_match` (Harness): Fraction of instances where the prediction matches the gold exactly. - - `exact_match` (HELM): Fraction of instances where the prediction matches the gold at the exception of the border whitespaces (= after a `strip` has been applied to both). + - `exact_match` (HELM): Fraction of instances where the prediction matches the gold with the exception of the border whitespaces (= after a `strip` has been applied to both). - `quasi_exact_match` (HELM): Fraction of instances where the normalized prediction matches the normalized gold (normalization done on whitespace, articles, capitalization, ...). Other variations exist, with other normalizers, such as `quasi_exact_match_triviaqa`, which only normalizes the predictions after applying a strip to all sentences. - `prefix_exact_match` (HELM): Fraction of instances where the beginning of the prediction matches the gold at the exception of the border whitespaces (= after a `strip` has been applied to both). - `prefix_quasi_exact_match` (HELM): Fraction of instances where the normalized beginning of the prediction matches the normalized gold (normalization done on whitespace, articles, capitalization, ...) @@ -382,7 +381,7 @@ These metrics need the model to generate an output. They are therefore slower. - `maj_at_8_gsm8k` (Lighteval): Majority choice evaluation, using the gsm8k normalisation for the predictions and gold ### Metrics for specific tasks -To keep compatibility with the Harness for some specific tasks, we ported their evaluations more or less as such. They include `drop` (for the DROP dataset) and `truthfulqa_mc_metrics` (for TruthfulQA). In general, except for tasks where the dataset has a very different formatting than usual (an other language, programming language, math, ...), we want to use standard implementations of the above metrics. It makes little sense to have 10 different versions of an exact match depending on the task. However, most of the above metrics are parametrizable so that you can change the normalization applied easily for experimental purposes. +To keep compatibility with the Harness for some specific tasks, we ported their evaluations more or less as such. They include `drop` (for the DROP dataset) and `truthfulqa_mc_metrics` (for TruthfulQA). In general, except for tasks where the dataset has very different formatting than usual (another language, programming language, math, ...), we want to use standard implementations of the above metrics. It makes little sense to have 10 different versions of an exact match depending on the task. However, most of the above metrics are parametrizable so that you can change the normalization applied easily for experimental purposes. ### Not working yet These metrics need both the generation and its logprob. They are not working at the moment, as this fn is not in the AI Harness. @@ -458,4 +457,3 @@ python3 -m build . url = {https://github.com/huggingface/lighteval} } ``` - diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 6e983247..6210f13e 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -23,6 +23,7 @@ """This module manages all the metrics occurring at the sample level. The results of said metrics are then aggregated using simple function (min, mean, max, ...) at the corpus level. Most metrics fall under this category. 
""" + import os from typing import Union @@ -327,7 +328,7 @@ def __init__( Args: methods (str | list[str]): What type of ROUGE scoring to use. Can be one or any of `rouge1`, `rouge2`, `rougeL` or `rougeLsum`. - multiple_golds (bool, optional): Whether to compute ROUGE by allowing the comparision to several golds + multiple_golds (bool, optional): Whether to compute ROUGE by allowing the comparison to several golds at once, or to compute ROUGE on individual gold/prediction pairs and aggregate afterwards. Defaults to False. bootstrap (bool, optional): Whether to use bootstrapping. Defaults to False. aggregation_function (callable, optional): How to aggregate the item results. Defaults to max. @@ -645,13 +646,13 @@ def __init__(self, judge_model_name: str, template_path: str, multi_turn: bool = def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]: """ - Compute the score of a generative taks using a llm as a judge. + Compute the score of a generative task using a llm as a judge. The generative task can be multiturn with 2 turns max, in that case, we return scores for turn 1 and 2. Also returns user_prompt and judgment which are ignored later by the aggregator. """ - # If we are evaluating a multiturn task, we need to have specific field in the formated doc + # If we are evaluating a multiturn task, we need to have specific field in the formatted doc if self.multi_turn: questions = formatted_doc.specific["multi_turn_queries"] ref_answers = formatted_doc.specific.get("reference", None) if formatted_doc.specific is not None else None diff --git a/src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl b/src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl index 86854fff..4ec7524c 100644 --- a/src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl +++ b/src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl @@ -5,4 +5,4 @@ {"name": "single-v1", "type": "single", "system_prompt": "You are a helpful assistant.", "prompt_template": "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n[Question]\n{question}\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"} {"name": "single-math-v1", "type": "single", "system_prompt": "You are a helpful assistant.", "prompt_template": "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. 
After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n[Question]\n{question}\n\n[The Start of Reference Answer]\n{ref_answer_1}\n[The End of Reference Answer]\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"} {"name": "single-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. You evaluation should focus on the assistant's answer to the second user question. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"} -{"name": "single-math-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. You evaluation should focus on the assistant's answer to the second question. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"} \ No newline at end of file +{"name": "single-math-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. You evaluation should focus on the assistant's answer to the second question. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. 
After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"} From a98210fd3a2d1e8bface1c32b72ebd5017173a4c Mon Sep 17 00:00:00 2001 From: Lucain Date: Tue, 21 May 2024 14:38:07 +0200 Subject: [PATCH 13/25] Mention HF_TOKEN in readme (#194) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6b116fc7..735a62da 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ The setup tested most is: pip install '.[accelerate,quantization,adapters]' ``` -If you want to push your results to the Hugging Face Hub, don't forget to add your access token to the environment variable `HUGGING_FACE_HUB_TOKEN`. You can do this by running: +If you want to push your results to the Hugging Face Hub, don't forget to add your access token to the environment variable `HF_TOKEN`. You can do this by running: ```shell huggingface-cli login From a3d1eea1983b306a93e913471f3312d477c54a10 Mon Sep 17 00:00:00 2001 From: Sadra Barikbin Date: Wed, 3 Jul 2024 19:04:10 +0330 Subject: [PATCH 14/25] Download BERT scorer lazily (#190) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> --- src/lighteval/metrics/imports/bert_scorer.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/lighteval/metrics/imports/bert_scorer.py b/src/lighteval/metrics/imports/bert_scorer.py index dd8c0ee8..442ee9c7 100644 --- a/src/lighteval/metrics/imports/bert_scorer.py +++ b/src/lighteval/metrics/imports/bert_scorer.py @@ -375,11 +375,9 @@ def __init__( self._model_type = model_type self._num_layers = num_layers - # Building model and tokenizer - self._tokenizer = AutoTokenizer.from_pretrained(model_type) - self._model = AutoModel.from_pretrained(model_type) - self._model.eval() - self._model.to(self.device) + # Model and tokenizer are lazily loaded in `score()`. + self._tokenizer = None + self._model = None self._idf_dict = None @@ -443,6 +441,13 @@ def score(self, cands, refs, verbose=False, batch_size=64, return_hash=False): the *best* score among all references. 
""" + if self._model is None: + hlog(f"Loading BERTScorer model `{self._model_type}`") + self._tokenizer = AutoTokenizer.from_pretrained(self._model_type) + self._model = AutoModel.from_pretrained(self._model_type) + self._model.eval() + self._model.to(self.device) + ref_group_boundaries = None if not isinstance(refs[0], str): ref_group_boundaries = [] From 7fcaab354853abf08d71f80340c29814dae17e91 Mon Sep 17 00:00:00 2001 From: shaltielshmid Date: Thu, 4 Jul 2024 11:23:10 +0300 Subject: [PATCH 15/25] Updated tgi_model and added parameters for endpoint_model (#208) * Added image url parameter * Fixed up tgi model config * Undid tgi available check * Adjust tgi parameter names, and checked for attr existence * Fixed task Id in argparse * Removed obfuscation from private functions, to allow inheritance to override * Updated tgi model to inherit from endpoint and just modify client calls * Added option to specify model id in config for tgi model * Added option to specify custom env vars * Updated env vras * Applied ruff format * Added docs + readme * Ruff format --- README.md | 22 +++- examples/model_configs/endpoint_model.yaml | 5 +- examples/model_configs/tgi_model.yaml | 1 + run_evals_accelerate.py | 5 +- src/lighteval/models/endpoint_model.py | 35 +++--- src/lighteval/models/model_config.py | 17 ++- src/lighteval/models/model_loader.py | 6 +- src/lighteval/models/tgi_model.py | 127 ++++++++------------- src/lighteval/utils.py | 2 +- 9 files changed, 109 insertions(+), 111 deletions(-) diff --git a/README.md b/README.md index 735a62da..b90fc976 100644 --- a/README.md +++ b/README.md @@ -139,7 +139,7 @@ accelerate launch --multi_gpu --num_processes= run_evals_accelerate.py --output_dir output_dir ``` -Examples of possible configuration files are provided in `examples/model_configs`. +You can find the template of the expected model configuration in [examples/model_configs/base_model.yaml_](./examples/model_configs/base_model.yaml). ### Evaluating a large model with pipeline parallelism @@ -182,6 +182,25 @@ python run_evals_accelerate.py \ --output_dir output_dir ``` +### Evaluate the model on a server/container. + +An alternative to launching the evaluation locally is to serve the model on a TGI-compatible server/container and then run the evaluation by sending requests to the server. The command is the same as before, except you specify a path to a yaml config file (detailed below): + +```shell +python run_evals_accelerate.py \ + --model_config_path="/path/to/config/file"\ + --tasks \ + --output_dir output_dir +``` + +There are two types of configuration files that can be provided for running on the server: + +1. [endpoint_model.yaml](./examples/model_configs/endpoint_model.yaml): This configuration allows you to launch the model using [HuggingFace's Inference Endpoints](https://huggingface.co/inference-endpoints/dedicated). You can specify in the configuration file all the relevant parameters, and then `lighteval` will automatically deploy the endpoint, run the evaluation, and finally delete the endpoint (unless you specify an endpoint that was already launched, in which case the endpoint won't be deleted afterwards). + +2. [tgi_model.yaml](./examples/model_configs/tgi_model.yaml): This configuration lets you specify the URL of a model running in a TGI container, such as one deployed on HuggingFace's serverless inference. + +Templates for these configurations can be found in [examples/model_configs](./examples/model_configs/). + ### Evaluate a model on extended, community, or custom tasks. 
Independently of the default tasks provided in `lighteval` that you will find in the `tasks_table.jsonl` file, you can use `lighteval` to evaluate models on tasks that require special processing (or have been added by the community). These tasks have their own evaluation suites and are defined as follows: @@ -190,7 +209,6 @@ Independently of the default tasks provided in `lighteval` that you will find in * `community`: tasks that have been added by the community. See the [`community_tasks`](./community_tasks) folder for examples. * `custom`: tasks that are defined locally and not present in the core library. Use this suite if you want to experiment with designing a special metric or task. - For example, to run an extended task like `ifeval`, you can run: ```shell python run_evals_accelerate.py \ diff --git a/examples/model_configs/endpoint_model.yaml b/examples/model_configs/endpoint_model.yaml index cc05dcf5..9e0db437 100644 --- a/examples/model_configs/endpoint_model.yaml +++ b/examples/model_configs/endpoint_model.yaml @@ -5,7 +5,7 @@ model: model: "meta-llama/Llama-2-7b-hf" revision: "main" dtype: "float16" # can be any of "awq", "eetq", "gptq", "4bit' or "8bit" (will use bitsandbytes), "bfloat16" or "float16" - reuse_existing: false # if true, ignore all params in instance + reuse_existing: false # if true, ignore all params in instance, and don't delete the endpoint after evaluation instance: accelerator: "gpu" region: "eu-west-1" @@ -15,5 +15,8 @@ model: framework: "pytorch" endpoint_type: "protected" namespace: null # The namespace under which to launch the endopint. Defaults to the current user's namespace + image_url: null # Optionally specify the docker image to use when launching the endpoint model. E.g., launching models with later releases of the TGI container with support for newer models. + env_vars: + null # Optional environment variables to include when launching the endpoint. e.g., `MAX_INPUT_LENGTH: 2048` generation: add_special_tokens: true diff --git a/examples/model_configs/tgi_model.yaml b/examples/model_configs/tgi_model.yaml index 4cfb8086..5e45641f 100644 --- a/examples/model_configs/tgi_model.yaml +++ b/examples/model_configs/tgi_model.yaml @@ -3,3 +3,4 @@ model: instance: inference_server_address: "" inference_server_auth: null + model_id: null # Optional, only required if the TGI container was launched with model_id pointing to a local directory \ No newline at end of file diff --git a/run_evals_accelerate.py b/run_evals_accelerate.py index a743cb49..23e46cb0 100644 --- a/run_evals_accelerate.py +++ b/run_evals_accelerate.py @@ -20,10 +20,11 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -""" Example run command: +"""Example run command: accelerate config accelerate launch run_evals_accelerate.py --tasks="leaderboard|hellaswag|5|1" --output_dir "/scratch/evals" --model_args "pretrained=gpt2" """ + import argparse from lighteval.main_accelerate import CACHE_DIR, main @@ -70,7 +71,7 @@ def get_parser(): "--tasks", type=str, default=None, - help="Id of a task, e.g. 'original|mmlu:abstract_algebra|5' or path to a texte file with a list of tasks", + help="Id of a task, e.g. 
'original|mmlu:abstract_algebra|5|0' or path to a texte file with a list of tasks", ) parser.add_argument("--num_fewshot_seeds", type=int, default=1, help="Number of trials the few shots") return parser diff --git a/src/lighteval/models/endpoint_model.py b/src/lighteval/models/endpoint_model.py index d79e0f91..b7e9af31 100644 --- a/src/lighteval/models/endpoint_model.py +++ b/src/lighteval/models/endpoint_model.py @@ -92,8 +92,9 @@ def __init__( "MAX_TOTAL_TOKENS": "2048", "MODEL_ID": "/repository", **config.get_dtype_args(), + **config.get_custom_env_vars(), }, - "url": "ghcr.io/huggingface/text-generation-inference:1.1.0", + "url": (config.image_url or "ghcr.io/huggingface/text-generation-inference:1.1.0"), }, ) hlog("Deploying your endpoint. Please wait.") @@ -149,7 +150,7 @@ def max_length(self): self._max_length = 2048 return self._max_length - def __async_process_request( + def _async_process_request( self, context: str, stop_tokens: list[str], max_tokens: int ) -> Coroutine[None, list[TextGenerationOutput], str]: # Todo: add an option to launch with conversational instead for chat prompts @@ -165,7 +166,7 @@ def __async_process_request( return generated_text - def __process_request(self, context: str, stop_tokens: list[str], max_tokens: int) -> TextGenerationOutput: + def _process_request(self, context: str, stop_tokens: list[str], max_tokens: int) -> TextGenerationOutput: # Todo: add an option to launch with conversational instead for chat prompts # https://huggingface.co/docs/huggingface_hub/v0.20.3/en/package_reference/inference_client#huggingface_hub.AsyncInferenceClient.conversational generated_text = self.client.text_generation( @@ -179,13 +180,13 @@ def __process_request(self, context: str, stop_tokens: list[str], max_tokens: in return generated_text - async def __async_process_batch_generate( + async def _async_process_batch_generate( self, requests: list[GreedyUntilRequest], ) -> list[TextGenerationOutput]: return await asyncio.gather( *[ - self.__async_process_request( + self._async_process_request( context=request.context, stop_tokens=as_list(request.stop_sequence), max_tokens=request.generation_size, @@ -194,12 +195,12 @@ async def __async_process_batch_generate( ] ) - def __process_batch_generate( + def _process_batch_generate( self, requests: list[GreedyUntilRequest], ) -> list[TextGenerationOutput]: return [ - self.__process_request( + self._process_request( context=request.context, stop_tokens=as_list(request.stop_sequence), max_tokens=request.generation_size, @@ -207,12 +208,12 @@ def __process_batch_generate( for request in requests ] - async def __async_process_batch_logprob( + async def _async_process_batch_logprob( self, requests: list[LoglikelihoodRequest], rolling: bool = False ) -> list[TextGenerationOutput]: return await asyncio.gather( *[ - self.__async_process_request( + self._async_process_request( context=request.context if rolling else request.context + request.choice, stop_tokens=[], max_tokens=1, @@ -221,11 +222,11 @@ async def __async_process_batch_logprob( ] ) - def __process_batch_logprob( + def _process_batch_logprob( self, requests: list[LoglikelihoodRequest], rolling: bool = False ) -> list[TextGenerationOutput]: return [ - self.__process_request( + self._process_request( context=request.context if rolling else request.context + request.choice, stop_tokens=[], max_tokens=1, @@ -267,9 +268,9 @@ def greedy_until( ) if self.use_async: - responses = asyncio.run(self.__async_process_batch_generate(batch)) + responses = 
asyncio.run(self._async_process_batch_generate(batch)) else: - responses = self.__process_batch_generate(batch) + responses = self._process_batch_generate(batch) for response in responses: results.append( GenerateReturn( @@ -303,9 +304,9 @@ def loglikelihood( for batch in tqdm(dataloader, desc="Loglikelihoods", position=1, leave=False, disable=self.disable_tqdm): if self.use_async: - responses = asyncio.run(self.__async_process_batch_logprob(batch)) + responses = asyncio.run(self._async_process_batch_logprob(batch)) else: - responses = self.__process_batch_logprob(batch) + responses = self._process_batch_logprob(batch) for cur_request, response in zip(batch, responses): cont_toks = torch.tensor(cur_request.tokenized_continuation) len_choice = len(cont_toks) @@ -351,9 +352,9 @@ def loglikelihood_rolling( dataloader, desc="Loglikelihoods, rolling", position=1, leave=False, disable=self.disable_tqdm ): if self.use_async: - responses = asyncio.run(self.__async_process_batch_logprob(batch, rolling=True)) + responses = asyncio.run(self._async_process_batch_logprob(batch, rolling=True)) else: - responses = self.__process_batch_logprob(batch, rolling=True) + responses = self._process_batch_logprob(batch, rolling=True) for response in responses: logits = [t.logprob for t in response.details.tokens[:-1]] diff --git a/src/lighteval/models/model_config.py b/src/lighteval/models/model_config.py index 5cb7c89d..f2736e1a 100644 --- a/src/lighteval/models/model_config.py +++ b/src/lighteval/models/model_config.py @@ -200,6 +200,7 @@ def init_configs(self, env_config: EnvConfig): class TGIModelConfig: inference_server_address: str inference_server_auth: str + model_id: str @dataclass @@ -224,6 +225,8 @@ class InferenceEndpointModelConfig: add_special_tokens: bool = True revision: str = "main" namespace: str = None # The namespace under which to launch the endopint. Defaults to the current user's namespace + image_url: str = None + env_vars: dict = None def get_dtype_args(self) -> Dict[str, str]: model_dtype = self.model_dtype.lower() @@ -237,6 +240,9 @@ def get_dtype_args(self) -> Dict[str, str]: return {"DTYPE": model_dtype} return {} + def get_custom_env_vars(self) -> Dict[str, str]: + return {k: str(v) for k, v in self.env_vars.items()} if self.env_vars else {} + @staticmethod def nullable_keys() -> list[str]: """ @@ -244,7 +250,7 @@ def nullable_keys() -> list[str]: keys be specified in the configuration in order to launch the endpoint. This function returns the list of keys that are not required and can remain None. 
""" - return ["namespace"] + return ["namespace", "env_vars", "image_url"] def create_model_config(args: Namespace, accelerator: Union["Accelerator", None]) -> BaseModelConfig: # noqa: C901 @@ -271,7 +277,7 @@ def create_model_config(args: Namespace, accelerator: Union["Accelerator", None] return BaseModelConfig(**args_dict) - if args.model_config: + if hasattr(args, "model_config") and args.model_config: config = args.model_config["model"] else: with open(args.model_config_path, "r") as f: @@ -279,8 +285,9 @@ def create_model_config(args: Namespace, accelerator: Union["Accelerator", None] if config["type"] == "tgi": return TGIModelConfig( - inference_server_address=args["instance"]["inference_server_address"], - inference_server_auth=args["instance"]["inference_server_auth"], + inference_server_address=config["instance"]["inference_server_address"], + inference_server_auth=config["instance"]["inference_server_auth"], + model_id=config["instance"]["model_id"], ) if config["type"] == "endpoint": @@ -303,6 +310,8 @@ def create_model_config(args: Namespace, accelerator: Union["Accelerator", None] instance_size=config["instance"]["instance_size"], instance_type=config["instance"]["instance_type"], namespace=config["instance"]["namespace"], + image_url=config["instance"].get("image_url", None), + env_vars=config["instance"].get("env_vars", None), ) return InferenceModelConfig(model=config["base_params"]["endpoint_name"]) diff --git a/src/lighteval/models/model_loader.py b/src/lighteval/models/model_loader.py index 3af5be26..dd55b424 100644 --- a/src/lighteval/models/model_loader.py +++ b/src/lighteval/models/model_loader.py @@ -88,10 +88,12 @@ def load_model_with_tgi(config: TGIModelConfig): raise ImportError(NO_TGI_ERROR_MSG) hlog(f"Load model from inference server: {config.inference_server_address}") - model = ModelClient(address=config.inference_server_address, auth_token=config.inference_server_auth) + model = ModelClient( + address=config.inference_server_address, auth_token=config.inference_server_auth, model_id=config.model_id + ) model_name = str(model.model_info["model_id"]) model_sha = model.model_info["model_sha"] - model_precision = model.model_info["dtype"] + model_precision = model.model_info["model_dtype"] model_size = -1 model_info = ModelInfo( model_name=model_name, diff --git a/src/lighteval/models/tgi_model.py b/src/lighteval/models/tgi_model.py index 5d519667..75415258 100644 --- a/src/lighteval/models/tgi_model.py +++ b/src/lighteval/models/tgi_model.py @@ -21,15 +21,14 @@ # SOFTWARE. import asyncio -import math -from typing import Coroutine, List, Tuple, Union +from typing import Coroutine -import numpy as np import requests -from tqdm import tqdm +from huggingface_hub import TextGenerationOutput from transformers import AutoTokenizer -from lighteval.utils import NO_TGI_ERROR_MSG, as_list, is_tgi_available +from lighteval.models.endpoint_model import InferenceEndpointModel +from lighteval.utils import NO_TGI_ERROR_MSG, is_tgi_available if is_tgi_available(): @@ -45,99 +44,63 @@ def divide_chunks(array, n): yield array[i : i + n] -class ModelClient: +# inherit from InferenceEndpointModel instead of LightevalModel since they both use the same interface, and only overwrite +# the client functions, since they use a different client. 
+class ModelClient(InferenceEndpointModel): _DEFAULT_MAX_LENGTH: int = 4096 - def __init__( - self, - address, - auth_token=None, - ) -> None: + def __init__(self, address, auth_token=None, model_id=None) -> None: if not is_tgi_available(): raise ImportError(NO_TGI_ERROR_MSG) - headers = {} if auth_token is None else {"Authorization": f"Basic {auth_token}"} + headers = {} if auth_token is None else {"Authorization": f"Bearer {auth_token}"} self.client = AsyncClient(address, headers=headers, timeout=240) self._max_gen_toks = 256 - self.model_info = requests.get(f"{address}/info").json() - self.tokenizer = AutoTokenizer.from_pretrained(self.model_info["model_id"]) - - def __process_request_generate(self, request: Tuple[str, Union[Tuple, List]]) -> Coroutine[None, List, str]: - context, stopping_arugments = request - - if isinstance(stopping_arugments, tuple): - stop_sequence_arg, max_gen_tokens_arg = stopping_arugments - stop_sequences = as_list(stop_sequence_arg) - # Todo @clefourrier add proper messaging explaining this - # we don't want people to be surprised because they set a max len in the model overwritten by the eval - max_tokens = max_gen_tokens_arg - else: - stop_sequences = as_list(stopping_arugments) - max_tokens = self._max_gen_toks - - if stop_sequences is None or stop_sequences == [None]: - stop_sequences = [] - + self.model_info = requests.get(f"{address}/info", headers=headers).json() + if "model_id" not in self.model_info: + raise ValueError("Error occured when fetching info: " + str(self.model_info)) + if model_id: + self.model_info["model_id"] = model_id + self._tokenizer = AutoTokenizer.from_pretrained(self.model_info["model_id"]) + self._add_special_tokens = True + self.use_async = True + + def _async_process_request( + self, context: str, stop_tokens: list[str], max_tokens: int + ) -> Coroutine[None, list[TextGenerationOutput], str]: + # Todo: add an option to launch with conversational instead for chat prompts generated_text = self.client.generate( - context, - max_new_tokens=max_tokens, + prompt=context, decoder_input_details=True, - stop_sequences=stop_sequences, - seed=42, - truncate=ModelClient._DEFAULT_MAX_LENGTH, + max_new_tokens=max_tokens, + stop_sequences=stop_tokens, ) return generated_text - async def __process_batch_generate(self, requests: List[Tuple[str, Union[Tuple, List]]]): - return await asyncio.gather(*[self.__process_request_generate(request) for request in requests]) - - def greedy_until(self, requests: List[Tuple[str, Union[Tuple, List]]], override_bs=None) -> List[str]: - generated_texts: List[str] = [] - - batch_size = override_bs if override_bs > 0 else BATCH_SIZE - - for batch in tqdm( - divide_chunks(requests, batch_size), total=math.ceil(len(requests) // batch_size), maxinterval=2 - ): - results = asyncio.run(self.__process_batch_generate(batch)) - generated_texts.extend([result.generated_text for result in results]) - - return generated_texts + def _process_request(self, *args, **kwargs) -> TextGenerationOutput: + return asyncio.run(self._async_process_request(*args, **kwargs)) - def __process_request_logprob(self, request: Tuple[str, str]) -> Coroutine[None, List, str]: - context, choice = request - out = self.client.generate(context + choice, max_new_tokens=1, decoder_input_details=True) - return out - - async def __process_batch_logprob(self, requests: List[Tuple[str, str]]): - return await asyncio.gather(*[self.__process_request_logprob(request) for request in requests]) - - def loglikelihood(self, requests: List[Tuple[str, str]], 
override_bs=None) -> List[Tuple[float, bool]]: - res: List[Tuple[float, bool]] = [] - - batch_size = override_bs if override_bs > 0 else BATCH_SIZE - - for batch in tqdm( - divide_chunks(requests, batch_size), total=math.ceil(len(requests) // batch_size), maxinterval=1 - ): - results = asyncio.run(self.__process_batch_logprob(batch)) - details = [result.details.prefill for result in results] - - for detail, (context, choice) in zip(details, batch): - tokenized_context = self.tokenizer.tokenize(context, add_special_tokens=True) - tokenized_input = self.tokenizer.tokenize(context + choice, add_special_tokens=True) + def set_cache_hook(self, cache_hook): + self.cache_hook = cache_hook - i = 0 - while i < len(tokenized_context) and tokenized_input[i] == tokenized_context[i]: - i += 1 + @property + def tokenizer(self): + return self._tokenizer - logprobs = [token.logprob for token in detail[i:]] + @property + def add_special_tokens(self): + return self._add_special_tokens - logit_sum: float = np.sum(logprobs) - res.append((logit_sum, False)) + @property + def max_length(self) -> int: + if hasattr(self.tokenizer, "model_max_length"): + return self.tokenizer.model_max_length + return ModelClient._DEFAULT_MAX_LENGTH - return res + @property + def disable_tqdm(self) -> bool: + False - def set_cache_hook(self, cache_hook): - self.cache_hook = cache_hook + def cleanup(self): + pass diff --git a/src/lighteval/utils.py b/src/lighteval/utils.py index d3c32e99..3380fc9a 100644 --- a/src/lighteval/utils.py +++ b/src/lighteval/utils.py @@ -153,7 +153,7 @@ def is_accelerate_available() -> bool: def is_tgi_available() -> bool: - return importlib.util.find_spec("text-generation") is not None + return importlib.util.find_spec("text_generation") is not None NO_TGI_ERROR_MSG = "You are trying to start a text generation inference endpoint, but text-generation is not present in your local environement. Please install it using pip." 
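The method renames in the endpoint model above (`__async_process_request` -> `_async_process_request`, and so on) are what make the new `ModelClient` override of the client calls possible at all: Python name-mangles double-underscore attributes per class, so a subclass override of a `__method` is invisible to the parent's code paths. A small, self-contained sketch of that behaviour (the class names below are illustrative only, not lighteval code):

```python
# Why the double underscores had to go: name mangling means a subclass
# override of a __method is never seen by the parent's own calls.
class Parent:
    def run(self):
        # `self.__process()` compiles to `self._Parent__process()`
        return self.__process(), self._process()

    def __process(self):  # mangled to _Parent__process
        return "parent private"

    def _process(self):  # single underscore: normal attribute lookup
        return "parent protected"


class Child(Parent):
    def __process(self):  # mangled to _Child__process, never called by Parent.run
        return "child private"

    def _process(self):  # correctly overrides the parent's method
        return "child protected"


print(Child().run())  # ('parent private', 'child protected')
```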
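As a rough usage sketch of the reworked TGI backend, mirroring what `create_model_config` and `load_model_with_tgi` do with `examples/model_configs/tgi_model.yaml`. The address and token below are placeholder assumptions, and the snippet presumes a TGI container is already running at that address and that the `text_generation` package is installed:

```python
from lighteval.models.model_config import TGIModelConfig
from lighteval.models.tgi_model import ModelClient

# Placeholder values, not part of the patch. `model_id` is only needed when the
# container was launched with model_id pointing to a local directory, so that
# the tokenizer can still be resolved from the Hub.
config = TGIModelConfig(
    inference_server_address="http://localhost:8080",
    inference_server_auth=None,
    model_id=None,
)

# Same call as load_model_with_tgi(); ModelClient now reuses the
# InferenceEndpointModel request logic and only swaps out the client.
model = ModelClient(
    address=config.inference_server_address,
    auth_token=config.inference_server_auth,
    model_id=config.model_id,
)
print(model.model_info["model_id"], model.max_length)
```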
From 3a808336e19ea8785aabdb259476634eb2e8f33c Mon Sep 17 00:00:00 2001 From: Nathan Habib <30601243+NathanHB@users.noreply.github.com> Date: Thu, 4 Jul 2024 12:48:24 +0200 Subject: [PATCH 16/25] fix llm as judge warnings (#173) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * commit * fixes * fix style * fixes * make style * Fix import error detection for open ai package (llm as a judge metric) --------- Co-authored-by: Nathan Habib Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> --- .../mt_bench => metrics}/judge_prompts.jsonl | 0 src/lighteval/metrics/llm_as_judge.py | 14 ++++++++++--- src/lighteval/metrics/metrics.py | 10 ++++++---- src/lighteval/metrics/metrics_sample.py | 20 ++++++++----------- src/lighteval/tasks/extended/mt_bench/main.py | 2 +- src/lighteval/tasks/lighteval_task.py | 16 ++++++++++++++- src/lighteval/utils.py | 7 +++++++ 7 files changed, 48 insertions(+), 21 deletions(-) rename src/lighteval/{tasks/extended/mt_bench => metrics}/judge_prompts.jsonl (100%) diff --git a/src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl b/src/lighteval/metrics/judge_prompts.jsonl similarity index 100% rename from src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl rename to src/lighteval/metrics/judge_prompts.jsonl diff --git a/src/lighteval/metrics/llm_as_judge.py b/src/lighteval/metrics/llm_as_judge.py index 12b637a3..5b70e9d5 100644 --- a/src/lighteval/metrics/llm_as_judge.py +++ b/src/lighteval/metrics/llm_as_judge.py @@ -27,9 +27,8 @@ import time from typing import Optional -from openai import OpenAI - from lighteval.logging.hierarchical_logger import hlog_warn +from lighteval.utils import NO_OPENAI_ERROR_MSG, is_openai_available class JudgeOpenAI: @@ -70,7 +69,8 @@ def __init__( openai_api_key: str, multi_turn: bool = False, ): - self.client = OpenAI(api_key=openai_api_key) + self.client = None # loaded lazily + self.openai_api_key = openai_api_key self.model = model self.seed = seed self.temperature = temperature @@ -112,6 +112,14 @@ def evaluate_answer( Raises: Exception: If an error occurs during the API call. """ + if self.client is None: + if not is_openai_available(): + raise ImportError(NO_OPENAI_ERROR_MSG) + + from openai import OpenAI + + self.client = OpenAI(api_key=self.openai_api_key) + prompts = [ self.__get_prompts_single_turn( questions[0], answers[0], references[0] if references is not None and len(references) > 0 else None diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index f7eaedba..f970e850 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -20,6 +20,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
+import os + import numpy as np from aenum import Enum @@ -225,14 +227,14 @@ class Metrics(Enum): corpus_level_fn=np.mean, higher_is_better=True, ) - llm_judge_multi_turn = SampleLevelMetricGrouping( + llm_judge_multi_turn_openai = SampleLevelMetricGrouping( metric=["single_turn", "multi_turn"], higher_is_better=True, category=MetricCategory.LLM_AS_JUDGE_MULTI_TURN, use_case=MetricUseCase.SUMMARIZATION, sample_level_fn=JudgeLLM( judge_model_name="gpt-3.5-turbo", - template_path="src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl", + template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"), multi_turn=True, ).compute, corpus_level_fn={ @@ -240,14 +242,14 @@ class Metrics(Enum): "multi_turn": np.mean, }, ) - llm_judge = SampleLevelMetricGrouping( + llm_judge_openai = SampleLevelMetricGrouping( metric=["judge_score"], higher_is_better=True, category=MetricCategory.LLM_AS_JUDGE, use_case=MetricUseCase.SUMMARIZATION, sample_level_fn=JudgeLLM( judge_model_name="gpt-3.5-turbo", - template_path="src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl", + template_path=os.path.join(os.path.dirname(__file__), "", "judge_prompts.jsonl"), multi_turn=False, ).compute, corpus_level_fn={ diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 6210f13e..1a52d6fd 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -631,18 +631,14 @@ def __init__(self, judge_model_name: str, template_path: str, multi_turn: bool = OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") self.multi_turn = multi_turn - try: - self.judge = JudgeOpenAI( - model=judge_model_name, - seed=42, - temperature=0.0, - templates_path=template_path, - openai_api_key=OPENAI_API_KEY, - multi_turn=multi_turn, - ) - except Exception as e: - print(f"Could not initialize the JudgeOpenAI model:\n{e}") - self.judge = None + self.judge = JudgeOpenAI( + model=judge_model_name, + seed=42, + temperature=0.0, + templates_path=template_path, + openai_api_key=OPENAI_API_KEY, + multi_turn=multi_turn, + ) def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]: """ diff --git a/src/lighteval/tasks/extended/mt_bench/main.py b/src/lighteval/tasks/extended/mt_bench/main.py index ec8347b7..a0ce741c 100644 --- a/src/lighteval/tasks/extended/mt_bench/main.py +++ b/src/lighteval/tasks/extended/mt_bench/main.py @@ -45,7 +45,7 @@ evaluation_splits=["train"], few_shots_split="", few_shots_select="random", - metric=["llm_judge_multi_turn"], + metric=["llm_judge_multi_turn_openai"], generation_size=1024, stop_sequence=[], ) diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index f5c7a1f9..85f4e025 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -21,6 +21,7 @@ # SOFTWARE. import collections +import os import random from dataclasses import dataclass from multiprocessing import Pool @@ -53,7 +54,7 @@ RequestType, TaskExampleId, ) -from lighteval.utils import as_list +from lighteval.utils import NO_OPENAI_ERROR_MSG, as_list, is_openai_available from . 
import tasks_prompt_formatting @@ -200,8 +201,21 @@ def __init__( # noqa: C901 self.metrics = as_list(cfg.metric) self.suite = as_list(cfg.suite) ignored = [metric for metric in self.metrics if Metrics[metric].value.category == MetricCategory.IGNORED] + if len(ignored) > 0: hlog_warn(f"[WARNING] Not implemented yet: ignoring the metric {' ,'.join(ignored)} for task {self.name}.") + + if any( + Metrics[metric].value.category in [MetricCategory.LLM_AS_JUDGE, MetricCategory.LLM_AS_JUDGE_MULTI_TURN] + for metric in self.metrics + ): + if not is_openai_available(): + raise ImportError(NO_OPENAI_ERROR_MSG) + if os.getenv("OPENAI_API_KEY") is None: + raise ValueError( + "Using llm as judge metric but no OPEN_API_KEY were found, please set it with: export OPEN_API_KEY={yourkey}" + ) + current_categories = [Metrics[metric].value.category for metric in self.metrics] self.has_metric_category = {category: (category in current_categories) for category in MetricCategory} # Sub-optimal system - we might want to store metric parametrisation in a yaml conf for example diff --git a/src/lighteval/utils.py b/src/lighteval/utils.py index 3380fc9a..768a1cd8 100644 --- a/src/lighteval/utils.py +++ b/src/lighteval/utils.py @@ -191,6 +191,13 @@ def is_peft_available() -> bool: NO_PEFT_ERROR_MSG = "You are trying to use adapter weights models, for which you need `peft`, which is not available in your environment. Please install it using pip." +def is_openai_available() -> bool: + return importlib.util.find_spec("openai") is not None + + +NO_OPENAI_ERROR_MSG = "You are trying to use an Open AI LLM as a judge, for which you need `openai`, which is not available in your environment. Please install it using pip." + + def can_load_extended_tasks() -> bool: imports = [] for package in ["langdetect"]: From 0bceaee026bb029cf3ea14ff5a1dc032abcd5543 Mon Sep 17 00:00:00 2001 From: Philipp Schmid <32632186+philschmid@users.noreply.github.com> Date: Thu, 4 Jul 2024 16:38:51 +0200 Subject: [PATCH 17/25] ADD GPT-4 as Judge (#206) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * ADD GPT-4 as Judge * Fix style --------- Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> --- src/lighteval/metrics/metrics_sample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 1a52d6fd..ef3798e4 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -622,7 +622,7 @@ def edit_similarity(self, s1, s2): class JudgeLLM: - available_models = ["gpt-3.5-turbo"] + available_models = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4"] def __init__(self, judge_model_name: str, template_path: str, multi_turn: bool = False): if judge_model_name not in self.available_models: From 843a0f8c7cb9de20f0188c86a06a0eb429c36974 Mon Sep 17 00:00:00 2001 From: Sadra Barikbin Date: Fri, 5 Jul 2024 10:27:37 +0330 Subject: [PATCH 18/25] Fix a few typos and do a tiny refactor (#187) --- run_evals_accelerate.py | 11 ++++++++--- src/lighteval/evaluator.py | 2 +- src/lighteval/logging/evaluation_tracker.py | 2 +- src/lighteval/metrics/imports/bert_scorer.py | 2 +- src/lighteval/metrics/judge_prompts.jsonl | 4 ++-- src/lighteval/metrics/metrics.py | 2 +- src/lighteval/metrics/metrics_sample.py | 2 +- src/lighteval/models/base_model.py | 8 ++++---- src/lighteval/models/model_config.py | 11 ++++------- src/lighteval/models/model_loader.py | 4 ++-- 
src/lighteval/models/nanotron_model.py | 4 ++-- src/lighteval/tasks/lighteval_task.py | 4 ++-- src/lighteval/tasks/registry.py | 6 +++--- src/lighteval/tasks/requests.py | 6 ++---- 14 files changed, 34 insertions(+), 34 deletions(-) diff --git a/run_evals_accelerate.py b/run_evals_accelerate.py index 23e46cb0..20b6ec9f 100644 --- a/run_evals_accelerate.py +++ b/run_evals_accelerate.py @@ -51,7 +51,12 @@ def get_parser(): parser.add_argument( "--public_run", default=False, action="store_true", help="Push results and details to a public repo" ) - parser.add_argument("--cache_dir", type=str, default=CACHE_DIR) + parser.add_argument( + "--cache_dir", + type=str, + default=CACHE_DIR, + help="Cache directory for downloaded datasets & model, defaults to `HF_HOME` environment variable", + ) parser.add_argument( "--results_org", type=str, @@ -65,13 +70,13 @@ def get_parser(): "--custom_tasks", type=str, default=None, - help="Path to a file with custom tasks (a TASK list of dict and potentially prompt formating functions)", + help="Path to a file with custom tasks (a TASK list of dict and potentially prompt formatting functions)", ) group.add_argument( "--tasks", type=str, default=None, - help="Id of a task, e.g. 'original|mmlu:abstract_algebra|5|0' or path to a texte file with a list of tasks", + help="Comma-separated ids of tasks, e.g. 'original|mmlu:abstract_algebra|5' or path to a text file with a list of tasks", ) parser.add_argument("--num_fewshot_seeds", type=int, default=1, help="Number of trials the few shots") return parser diff --git a/src/lighteval/evaluator.py b/src/lighteval/evaluator.py index e837b922..883e5ef7 100644 --- a/src/lighteval/evaluator.py +++ b/src/lighteval/evaluator.py @@ -67,7 +67,7 @@ def evaluate( # noqa: C901 # A request output tupe is a Tuple where the first element is the index of # the request for one document of one task i.e. # task: "arc_easy", doc: "0"# request: "0" -> request_index = 0, - # We can have multiple request per doc for multi choice tasks for example. + # We can have multiple requests per doc for multi choice tasks for example. # all responses for each (task, doc) RequestIndexModelResponseTuple = collections.namedtuple( diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index 35a835bc..f4bdf956 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -511,7 +511,7 @@ def push_results_to_tensorboard( # noqa: C901 self, results: dict[str, dict[str, float]], details: dict[str, DetailsLogger.CompiledDetail] ): if not is_nanotron_available(): - hlog_warn("You cannot push results to tensorboard with having nanotron installed. Skipping") + hlog_warn("You cannot push results to tensorboard without having nanotron installed. Skipping") return config: Config = self.general_config_logger.config lighteval_config = config.lighteval diff --git a/src/lighteval/metrics/imports/bert_scorer.py b/src/lighteval/metrics/imports/bert_scorer.py index 442ee9c7..a5226e48 100644 --- a/src/lighteval/metrics/imports/bert_scorer.py +++ b/src/lighteval/metrics/imports/bert_scorer.py @@ -163,7 +163,7 @@ def greedy_cos_idf( - :param: `ref_masks` (torch.LongTensor): BxKxK, BERT attention mask for reference sentences. 
- :param: `ref_idf` (torch.Tensor): BxK, idf score of each word - piece in the reference setence + piece in the reference sentence - :param: `hyp_embedding` (torch.Tensor): embeddings of candidate sentences, BxKxd, B: batch size, K: longest length, d: bert dimenison diff --git a/src/lighteval/metrics/judge_prompts.jsonl b/src/lighteval/metrics/judge_prompts.jsonl index 4ec7524c..a43ef34c 100644 --- a/src/lighteval/metrics/judge_prompts.jsonl +++ b/src/lighteval/metrics/judge_prompts.jsonl @@ -4,5 +4,5 @@ {"name": "pair-math-v1-multi-turn", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user questions. Your evaluation should consider correctness and helpfulness. You will be given reference answers, the assistant A's answers, the assistant B's answers. Your job is to determine which assistant provides correct and helpful answers to the second user question. Begin your evaluation by comparing both assistants' answers with the reference answers. Identify and correct any mistakes. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_a_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_a_2}\n\n<|The End of Assistant A's Conversation with User|>\n\n\n<|The Start of Assistant B's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant B:\n{answer_b_1}\n\n### User:\n{question_2}\n\n### Assistant B:\n{answer_b_2}\n\n<|The End of Assistant B's Conversation with User|>", "description": "Prompt for multi-turn general questions", "category": "general", "output_format": "[[A]]"} {"name": "single-v1", "type": "single", "system_prompt": "You are a helpful assistant.", "prompt_template": "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n[Question]\n{question}\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"} {"name": "single-math-v1", "type": "single", "system_prompt": "You are a helpful assistant.", "prompt_template": "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. 
Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n[Question]\n{question}\n\n[The Start of Reference Answer]\n{ref_answer_1}\n[The End of Reference Answer]\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"} -{"name": "single-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. You evaluation should focus on the assistant's answer to the second user question. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"} -{"name": "single-math-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. You evaluation should focus on the assistant's answer to the second question. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"} +{"name": "single-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Your evaluation should focus on the assistant's answer to the second user question. Begin your evaluation by providing a short explanation. 
Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"} +{"name": "single-math-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. Your evaluation should focus on the assistant's answer to the second question. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"} diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index f970e850..262b20a0 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -249,7 +249,7 @@ class Metrics(Enum): use_case=MetricUseCase.SUMMARIZATION, sample_level_fn=JudgeLLM( judge_model_name="gpt-3.5-turbo", - template_path=os.path.join(os.path.dirname(__file__), "", "judge_prompts.jsonl"), + template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"), multi_turn=False, ).compute, corpus_level_fn={ diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index ef3798e4..b7876dbc 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -644,7 +644,7 @@ def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[ """ Compute the score of a generative task using a llm as a judge. The generative task can be multiturn with 2 turns max, in that case, we - return scores for turn 1 and 2. Also returns user_prompt and judgment + return scores for turn 1 and 2. Also returns user_prompt and judgement which are ignored later by the aggregator. 
""" diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index 3913fd80..df7b3e92 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -79,7 +79,7 @@ def __init__( self._add_special_tokens = config.add_special_tokens if config.add_special_tokens is not None else False self._tokenizer = self._create_auto_tokenizer(config, env_config) - # If model_parallel is not set we compare the number of process with the number of GPUs + # If model_parallel is not set we compare the number of processes with the number of GPUs self.model = self._create_auto_model(config, env_config) self.model.eval() torch.set_grad_enabled(False) @@ -819,7 +819,7 @@ def _loglikelihood_tokens( ) res.append(answer) - # Clean up GPUS + # Clean up GPUs del model_output del logits del batched_inputs @@ -852,7 +852,7 @@ def prepare_batch_logprob( hlog_warn("max_context is None, using max_length") max_context = self.max_length - # Each sample is concatenated and cut to lenght or padded to max_length + # Each sample is concatenated and cut to length or padded to max_length for orig_tokens in inputs: truncated.append(max(len(orig_tokens) - max_context, 0)) @@ -1030,7 +1030,7 @@ def _loglikelihood_single_token( ) res.append(answer) - # Clean up GPUS + # Clean up GPUs del out del batch_probs del batched_inputs diff --git a/src/lighteval/models/model_config.py b/src/lighteval/models/model_config.py index f2736e1a..b686c9bd 100644 --- a/src/lighteval/models/model_config.py +++ b/src/lighteval/models/model_config.py @@ -85,9 +85,9 @@ class BaseModelConfig: If `None`, the default value will be set to `True` for seq2seq models (e.g. T5) and `False` for causal models. model_parallel (bool, optional, defaults to False): - True/False: force to uses or not the `accelerate` library to load a large + True/False: force to use or not the `accelerate` library to load a large model across multiple devices. - Default: None which correspond to comparing the number of process with + Default: None which corresponds to comparing the number of processes with the number of GPUs. If it's smaller => model-parallelism, else not. dtype (Union[str, torch.dtype], optional, defaults to None):): Converts the model weights to `dtype`, if specified. Strings get @@ -277,11 +277,8 @@ def create_model_config(args: Namespace, accelerator: Union["Accelerator", None] return BaseModelConfig(**args_dict) - if hasattr(args, "model_config") and args.model_config: - config = args.model_config["model"] - else: - with open(args.model_config_path, "r") as f: - config = yaml.safe_load(f)["model"] + with open(args.model_config_path, "r") as f: + config = yaml.safe_load(f)["model"] if config["type"] == "tgi": return TGIModelConfig( diff --git a/src/lighteval/models/model_loader.py b/src/lighteval/models/model_loader.py index dd55b424..e662beac 100644 --- a/src/lighteval/models/model_loader.py +++ b/src/lighteval/models/model_loader.py @@ -57,8 +57,8 @@ def load_model( # noqa: C901 config: Union[BaseModelConfig, AdapterModelConfig, DeltaModelConfig, TGIModelConfig, InferenceEndpointModelConfig], env_config: EnvConfig, ) -> Tuple[Union[BaseModel, AdapterModel, DeltaModel, ModelClient], ModelInfo]: - """Will load either a model from an inference server or a model from a checkpoint. depending - on the arguments passed to the program. + """Will load either a model from an inference server or a model from a checkpoint, depending + on the config type. 
Args: args (Namespace): arguments passed to the program diff --git a/src/lighteval/models/nanotron_model.py b/src/lighteval/models/nanotron_model.py index 977b2b19..efe20709 100644 --- a/src/lighteval/models/nanotron_model.py +++ b/src/lighteval/models/nanotron_model.py @@ -846,7 +846,7 @@ def _loglikelihood_single_token( tq.desc = f"loglikelihood_single_token Subset {s} Node {dist.get_rank(self.parallel_context.world_pg)} - {human_format(tokens_per_sec)} tokens/s" - # Clean up GPUS + # Clean up GPUs del out del batch_probs del batched_inputs @@ -1083,7 +1083,7 @@ def _loglikelihood_tokens( tokens_per_sec = batched_inputs.numel() / (elapsed_time_per_iteration_ms / 1000) tq.desc = f"loglikelihood Subset {s} Node {dist.get_rank(self.parallel_context.world_pg)} - {human_format(tokens_per_sec)} tokens/s" - # Clean up GPUS + # Clean up GPUs del out del logits del batched_inputs diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index 85f4e025..33934caa 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -26,7 +26,7 @@ from dataclasses import dataclass from multiprocessing import Pool from pathlib import Path -from typing import TYPE_CHECKING, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union from datasets import load_dataset @@ -454,7 +454,7 @@ def get_request_type(self) -> list[RequestType]: # noqa C901 def construct_requests( self, formatted_doc: Doc, context: str, document_id_seed: str, current_task_name: str - ) -> List[Request]: + ) -> Dict[RequestType, List[Request]]: """ Constructs a list of requests from the task based on the given parameters. diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index abaa1745..df5e4da6 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -117,7 +117,7 @@ def get_task_dict( Args: task_name_list (List[str]): A list of task names. - custom_tasks (Optional[Union[str, ModuleType]]): Path to the custom tasks file or name of a module to import containing custom tasks or the module it-self + custom_tasks (Optional[Union[str, ModuleType]]): Path to the custom tasks file or name of a module to import containing custom tasks or the module itself extended_tasks (Optional[str]): The path to the extended tasks group of submodules Returns: @@ -159,7 +159,7 @@ def create_custom_tasks_module(custom_tasks: Union[str, ModuleType]) -> ModuleTy """Creates a custom task module to load tasks defined by the user in their own file. Args: - custom_tasks (Optional[Union[str, ModuleType]]): Path to the custom tasks file or name of a module to import containing custom tasks or the module it-self + custom_tasks (Optional[Union[str, ModuleType]]): Path to the custom tasks file or name of a module to import containing custom tasks or the module itself Returns: ModuleType: The newly imported/created custom tasks modules @@ -178,7 +178,7 @@ def get_custom_tasks(custom_tasks: Union[str, ModuleType]) -> Tuple[ModuleType, """Get all the custom tasks available from the given custom tasks file or module. 
Args: - custom_tasks (Optional[Union[str, ModuleType]]): Path to the custom tasks file or name of a module to import containing custom tasks or the module it-self + custom_tasks (Optional[Union[str, ModuleType]]): Path to the custom tasks file or name of a module to import containing custom tasks or the module itself """ custom_tasks_module = create_custom_tasks_module(custom_tasks=custom_tasks) tasks_string = "" diff --git a/src/lighteval/tasks/requests.py b/src/lighteval/tasks/requests.py index 283e6959..6dd30786 100644 --- a/src/lighteval/tasks/requests.py +++ b/src/lighteval/tasks/requests.py @@ -143,7 +143,7 @@ class TaskExampleId(NamedTuple): Represents the identifier for an example in a task. Attributes: - task_name (str): The name of the task. + task_name (str): The name of the task in `name|num_fewshot` format. doc_id_seed (str): The document id with the seed used for few_shot appended at the end. """ @@ -187,9 +187,7 @@ def get_golds(self, few_shot: bool = False): choices = self.choices golds = [] for gold_ix in gold_indices: - local_golds = as_list(choices[gold_ix]) - for local_gold in local_golds: - golds.append(local_gold) + golds.extend(as_list(choices[gold_ix])) return golds def __repr__(self): From 60646959cea6ff183688a37eeae443efc1fa4584 Mon Sep 17 00:00:00 2001 From: Anton Lozhkov Date: Mon, 8 Jul 2024 08:38:15 +0200 Subject: [PATCH 19/25] [Bugfix] Avoid truncating the outputs based on string lengths (#201) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix context size * - redundant condition --------- Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> --- src/lighteval/models/base_model.py | 37 +++++++++++------------- src/lighteval/models/nanotron_model.py | 39 ++++++++++++-------------- 2 files changed, 35 insertions(+), 41 deletions(-) diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index df7b3e92..7f17f24d 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -530,27 +530,7 @@ def greedy_until( returns_logits = batch[0].use_logits num_samples = batch[0].num_samples - # The main question for this step is the following: - # Would we rather truncate the prompt to allow generation to go to max_new_tokens, at the risk - # of loosing some meaning, or have some generations that are exceedingly short? - # The choice we go for here is to avoid truncating the prompt if we can, since it - # should have been managed by the prompt creator/few shot manager if requested by the user. context = [c.context for c in batch] - smallest_context = min(len(c) for c in context) - biggest_context = max(len(c) for c in context) - if smallest_context > self.max_length: - hlog_warn( - f"The smallest context of your batch ({smallest_context}) is bigger than the maximum context size allowed by the model ({self.max_length}) for a task in" - + str({i.task_name for i in batch}) - + ". This is likely to lead to some errors." 
# noqa C401 - ) - - if ( - biggest_context > self.max_length - ): # There will be truncation of at least one sample, maximum generation size will be one - max_new_tokens = 1 - else: # We can't allow generation of more than max_length - max_new_tokens = min(self.max_length - biggest_context, max_new_tokens) # See doc https://huggingface.co/docs/transformers/v4.38.2/en/pad_truncation#padding-and-truncation # Will do left truncation and padding, as defined when creating the tokenizer @@ -563,6 +543,23 @@ def greedy_until( add_special_tokens=self.add_special_tokens, ).to(self.device) + # The main question for this step is the following: + # Would we rather truncate the prompt to allow generation to go to max_new_tokens, at the risk + # of losing some meaning, or have some generations that are exceedingly short? + # The choice we go for here is to avoid truncating the prompt if we can, since it + # should have been managed by the prompt creator/few shot manager if requested by the user. + context_size = tokenized["input_ids"].shape[1] + if context_size > self.max_length: + hlog_warn( + f"The context size of your batch ({context_size}) is bigger than the maximum context size allowed by the model ({self.max_length}) for a task in" + + str({i.task_name for i in batch}) + + ". This is likely to lead to some errors." # noqa C401 + ) + # There will be truncation of at least one sample, maximum generation size will be one + max_new_tokens = 1 + else: # We can't allow generation of more than max_length + max_new_tokens = min(self.max_length - context_size, max_new_tokens) + prepared_batch = Batch( input_ids=tokenized["input_ids"], input_lengths=[len(item == 1) for item in tokenized["attention_mask"]], diff --git a/src/lighteval/models/nanotron_model.py b/src/lighteval/models/nanotron_model.py index efe20709..b75bc2b2 100644 --- a/src/lighteval/models/nanotron_model.py +++ b/src/lighteval/models/nanotron_model.py @@ -1207,27 +1207,7 @@ def greedy_until( "Nonotron models does not allow sampling evaluations - this is likely to fail or provide problematic results" ) - # The main question for this step is the following: - # Would we rather truncate the prompt to allow generation to go to max_new_tokens, at the risk - # of loosing some meaning, or have some generations that are exceedingly short? - # The choice we go for here is to avoid truncating the prompt if we can, since it - # should have been managed by the prompt creator/few shot manager if requested by the user. - context = [c.context for c in batch] # or tokenized context? - smallest_context = min(len(c) for c in context) - biggest_context = max(len(c) for c in context) - if smallest_context > self.max_length: - hlog_warn( - f"The smallest context of your batch ({smallest_context}) is bigger than the maximum context size allowed by the model ({self.max_length}) for a task in" - + str({i.task_name for i in batch}) - + ". This is likely to lead to some errors." 
# noqa C401 - ) - - if ( - biggest_context > self.max_length - ): # There will be truncation of at least one sample, maximum generation size will be one - max_new_tokens = 1 - else: # We can't allow generation of more than max_length - max_new_tokens = min(self.max_length - biggest_context, max_new_tokens) + context = [c.context for c in batch] # See doc https://huggingface.co/docs/transformers/v4.38.2/en/pad_truncation#padding-and-truncation # Will do left truncation and padding, as defined when creating the tokenizer @@ -1240,6 +1220,23 @@ def greedy_until( add_special_tokens=self.add_special_tokens, ).to(self.device) + # The main question for this step is the following: + # Would we rather truncate the prompt to allow generation to go to max_new_tokens, at the risk + # of losing some meaning, or have some generations that are exceedingly short? + # The choice we go for here is to avoid truncating the prompt if we can, since it + # should have been managed by the prompt creator/few shot manager if requested by the user. + context_size = tokenized["input_ids"].shape[1] + if context_size > self.max_length: + hlog_warn( + f"The context size of your batch ({context_size}) is bigger than the maximum context size allowed by the model ({self.max_length}) for a task in" + + str({i.task_name for i in batch}) + + ". This is likely to lead to some errors." # noqa C401 + ) + # There will be truncation of at least one sample, maximum generation size will be one + max_new_tokens = 1 + else: # We can't allow generation of more than max_length + max_new_tokens = min(self.max_length - context_size, max_new_tokens) + batch_model = Batch( input_ids=tokenized["input_ids"], input_lengths=[len(item == 1) for item in tokenized["attention_mask"]], From 0528f297ae84255aa7019232bfa1c7af870a54f2 Mon Sep 17 00:00:00 2001 From: Sadra Barikbin Date: Mon, 8 Jul 2024 12:34:11 +0330 Subject: [PATCH 20/25] Fix the bug (#216) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> --- src/lighteval/tasks/lighteval_task.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index 33934caa..fa70b61d 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -538,6 +538,18 @@ def construct_requests( generation_size=self.generation_size, ) ] + if self.has_metric_category[MetricCategory.LLM_AS_JUDGE]: + requests[RequestType.GREEDY_UNTIL] += [ + GreedyUntilRequest( + task_name=current_task_name, + example_index=document_id_seed, + request_index=0, + context=context, + stop_sequence=self.stop_sequence, + generation_size=self.generation_size, + num_samples=1, + ) + ] return requests From 70f7fc6d666808f9efdb0d588cdcb87185319d4a Mon Sep 17 00:00:00 2001 From: Guilherme Penedo Date: Tue, 9 Jul 2024 09:41:43 +0200 Subject: [PATCH 21/25] Adds a dummy/random model for baseline init (#220) --- README.md | 11 ++++ src/lighteval/models/dummy_model.py | 89 ++++++++++++++++++++++++++++ src/lighteval/models/model_config.py | 24 +++++++- src/lighteval/models/model_loader.py | 20 ++++++- src/lighteval/models/model_output.py | 4 +- 5 files changed, 141 insertions(+), 7 deletions(-) create mode 100644 src/lighteval/models/dummy_model.py diff --git a/README.md b/README.md index b90fc976..dc482773 100644 --- a/README.md +++ b/README.md @@ -239,6 +239,17 @@ python run_evals_accelerate.py \ --output_dir 
"./evals" ``` +### Using the dummy model +To debug or obtain random baseline scores for a given set of tasks, you can use the `dummy` model: +```shell +python run_evals_accelerate.py \ + --model_args "dummy"\ + --tasks \ + --output_dir output_dir +``` +This "model" randomly generates logprobs (for selection/accuracy tasks) and the string "random baseline" for generation tasks. +You can also select a specific seed for the random logprob values generated by the dummy model: `--model_args "dummy,seed=123"`. + ## Deep thanks `lighteval` was originally built on top of the great [Eleuther AI Harness](https://github.com/EleutherAI/lm-evaluation-harness) (we use the latter to power the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)). We also took a lot of inspiration from the amazing [HELM](https://crfm.stanford.edu/helm/latest/), notably for metrics. diff --git a/src/lighteval/models/dummy_model.py b/src/lighteval/models/dummy_model.py new file mode 100644 index 00000000..08335db5 --- /dev/null +++ b/src/lighteval/models/dummy_model.py @@ -0,0 +1,89 @@ +# MIT License +# +# Copyright (c) 2024 The HuggingFace Team +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +# inspired by https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/dummy.py + +import random +from typing import Optional + +from transformers import AutoTokenizer + +from lighteval.models.abstract_model import LightevalModel +from lighteval.models.model_config import DummyModelConfig, EnvConfig +from lighteval.models.model_output import GenerateReturn, LoglikelihoodReturn, LoglikelihoodSingleTokenReturn +from lighteval.tasks.requests import ( + GreedyUntilRequest, + LoglikelihoodRequest, + LoglikelihoodRollingRequest, + LoglikelihoodSingleTokenRequest, +) + + +class DummyModel(LightevalModel): + """Dummy model to generate random baselines.""" + + def __init__( + self, + config: DummyModelConfig, + env_config: EnvConfig, + ): + self.config = config + self.env_config = env_config + self._random = random.Random(self.config.seed) + self._tokenizer = None + + @property + def tokenizer(self): + if not self._tokenizer: + self._tokenizer = AutoTokenizer.from_pretrained("gpt2") + return self._tokenizer + + @property + def add_special_tokens(self): + return False + + @property + def max_length(self) -> int: + return 2048 + + def greedy_until( + self, requests: list[GreedyUntilRequest], override_bs: Optional[int] = None + ) -> list[GenerateReturn]: + return [GenerateReturn(result="random baseline") for _ in range(len(requests))] + + def loglikelihood( + self, requests: list[LoglikelihoodRequest], override_bs: Optional[int] = None + ) -> list[LoglikelihoodReturn]: + return [LoglikelihoodReturn((-self._random.random(), False)) for _ in requests] + + def loglikelihood_rolling( + self, requests: list[LoglikelihoodRollingRequest], override_bs: Optional[int] = None + ) -> list[LoglikelihoodReturn]: + return [LoglikelihoodReturn((-self._random.random(), False)) for _ in requests] + + def loglikelihood_single_token( + self, requests: list[LoglikelihoodSingleTokenRequest], override_bs: Optional[int] = None + ) -> list[LoglikelihoodSingleTokenReturn]: + return [ + LoglikelihoodSingleTokenReturn(result=[-self._random.random() for _ in req.tokenized_continuation]) + for req in requests + ] diff --git a/src/lighteval/models/model_config.py b/src/lighteval/models/model_config.py index b686c9bd..b6f4bb5d 100644 --- a/src/lighteval/models/model_config.py +++ b/src/lighteval/models/model_config.py @@ -203,6 +203,11 @@ class TGIModelConfig: model_id: str +@dataclass +class DummyModelConfig: + seed: int = 42 + + @dataclass class InferenceModelConfig: model: str @@ -253,7 +258,16 @@ def nullable_keys() -> list[str]: return ["namespace", "env_vars", "image_url"] -def create_model_config(args: Namespace, accelerator: Union["Accelerator", None]) -> BaseModelConfig: # noqa: C901 +def create_model_config( # noqa: C901 + args: Namespace, accelerator: Union["Accelerator", None] +) -> Union[ + BaseModelConfig, + AdapterModelConfig, + DeltaModelConfig, + TGIModelConfig, + InferenceEndpointModelConfig, + DummyModelConfig, +]: """ Create a model configuration based on the provided arguments. @@ -262,7 +276,7 @@ def create_model_config(args: Namespace, accelerator: Union["Accelerator", None] accelerator (Union[Accelerator, None]): accelerator to use for model training. Returns: - BaseModelConfig: model configuration. + Union[BaseModelConfig, AdapterModelConfig, DeltaModelConfig, TGIModelConfig, InferenceEndpointModelConfig, DummyModelConfig]: model configuration. Raises: ValueError: If both an inference server address and model arguments are provided. 
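Before the parsing hunk that follows, a short standalone sketch of how a `--model_args` string such as `dummy,seed=123` (see the README example added earlier in this patch) becomes a `DummyModelConfig`. The dict comprehension mirrors what the next hunk adds to `create_model_config`; `parse_dummy_args` is a name invented for illustration, and `DummyModelConfig` is copied from the hunk above.

```python
# Standalone illustration of the dummy-model argument parsing added by this patch.
from dataclasses import dataclass


@dataclass
class DummyModelConfig:
    seed: int = 42


def parse_dummy_args(model_args: str):
    # "dummy,seed=123" -> {"dummy": True, "seed": "123"}; bare keys become True.
    args_dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")}
    if args_dict.pop("dummy", False):
        return DummyModelConfig(**args_dict)
    return None  # a real run falls through to the other model config branches


print(parse_dummy_args("dummy"))           # DummyModelConfig(seed=42)
print(parse_dummy_args("dummy,seed=123"))  # DummyModelConfig(seed='123'); the seed stays a string, which random.Random accepts
```

With that config, the `DummyModel` defined above answers generative requests with the fixed string "random baseline" and likelihood requests with seeded random log-probabilities, so the baseline stays reproducible across runs.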
@@ -271,7 +285,11 @@ def create_model_config(args: Namespace, accelerator: Union["Accelerator", None] ValueError: If a base model is specified when not using delta weights or adapter weights. """ if args.model_args: - args_dict = {k.split("=")[0]: k.split("=")[1] for k in args.model_args.split(",")} + args_dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in args.model_args.split(",")} + + if args_dict.pop("dummy", False): + return DummyModelConfig(**args_dict) + args_dict["accelerator"] = accelerator args_dict["use_chat_template"] = args.use_chat_template diff --git a/src/lighteval/models/model_loader.py b/src/lighteval/models/model_loader.py index e662beac..c72d6403 100644 --- a/src/lighteval/models/model_loader.py +++ b/src/lighteval/models/model_loader.py @@ -27,11 +27,13 @@ from lighteval.models.adapter_model import AdapterModel from lighteval.models.base_model import BaseModel from lighteval.models.delta_model import DeltaModel +from lighteval.models.dummy_model import DummyModel from lighteval.models.endpoint_model import InferenceEndpointModel from lighteval.models.model_config import ( AdapterModelConfig, BaseModelConfig, DeltaModelConfig, + DummyModelConfig, EnvConfig, InferenceEndpointModelConfig, InferenceModelConfig, @@ -54,9 +56,16 @@ class ModelInfo: def load_model( # noqa: C901 - config: Union[BaseModelConfig, AdapterModelConfig, DeltaModelConfig, TGIModelConfig, InferenceEndpointModelConfig], + config: Union[ + BaseModelConfig, + AdapterModelConfig, + DeltaModelConfig, + TGIModelConfig, + InferenceEndpointModelConfig, + DummyModelConfig, + ], env_config: EnvConfig, -) -> Tuple[Union[BaseModel, AdapterModel, DeltaModel, ModelClient], ModelInfo]: +) -> Tuple[Union[BaseModel, AdapterModel, DeltaModel, ModelClient, DummyModel], ModelInfo]: """Will load either a model from an inference server or a model from a checkpoint, depending on the config type. 
@@ -82,6 +91,9 @@ def load_model( # noqa: C901 if isinstance(config, BaseModelConfig): return load_model_with_accelerate_or_default(config=config, env_config=env_config) + if isinstance(config, DummyModelConfig): + return load_dummy_model(config=config, env_config=env_config) + def load_model_with_tgi(config: TGIModelConfig): if not is_tgi_available(): @@ -143,3 +155,7 @@ def load_model_with_accelerate_or_default( hlog(f"Model info: {model_info}") return model, model_info + + +def load_dummy_model(config: DummyModelConfig, env_config: EnvConfig): + return DummyModel(config=config, env_config=env_config), ModelInfo(model_name="dummy", model_sha=str(config.seed)) diff --git a/src/lighteval/models/model_output.py b/src/lighteval/models/model_output.py index 51027858..ce85c020 100644 --- a/src/lighteval/models/model_output.py +++ b/src/lighteval/models/model_output.py @@ -31,8 +31,8 @@ class ModelReturn: result: Union[tuple, list, str] input_tokens: list[int] = field(default_factory=list) # model inputs generated_tokens: list[int] = field(default_factory=list) # model generations - truncated_tokens_count: Optional[int] = None # How many tokens truncated - padded_tokens_count: Optional[int] = None # How many tokens of padding + truncated_tokens_count: Optional[int] = 0 # How many tokens truncated + padded_tokens_count: Optional[int] = 0 # How many tokens of padding def get_result_for_eval(self): raise NotImplementedError() From ac57b78e7b34cf41ae93d1a9f8fae1b23f52ffe5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Fourrier?= <22726840+clefourrier@users.noreply.github.com> Date: Tue, 9 Jul 2024 12:13:01 +0200 Subject: [PATCH 22/25] Homogeneize logging system (#150) --------- Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> Co-authored-by: Nathan Habib --- README.md | 2 +- pyproject.toml | 3 +- run_evals_accelerate.py | 1 + src/lighteval/logging/evaluation_tracker.py | 230 +++++++++----------- src/lighteval/main_accelerate.py | 14 +- src/lighteval/main_nanotron.py | 8 +- src/lighteval/utils.py | 9 + 7 files changed, 136 insertions(+), 131 deletions(-) diff --git a/README.md b/README.md index dc482773..8c6f1063 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ Install the dependencies. For the default installation, you just need: pip install . 
``` -If you want to evaluate models with frameworks like `accelerate` or `peft`, you will need to specify the optional dependencies group that fits your use case (`accelerate`,`tgi`,`optimum`,`quantization`,`adapters`,`nanotron`): +If you want to evaluate models with frameworks like `accelerate` or `peft`, you will need to specify the optional dependencies group that fits your use case (`accelerate`,`tgi`,`optimum`,`quantization`,`adapters`,`nanotron`,`tensorboardX`): ```bash pip install '.[optional1,optional2]' diff --git a/pyproject.toml b/pyproject.toml index a9fe4bc7..b771942d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,7 +55,7 @@ keywords = ["evaluation", "nlp", "llm"] dependencies = [ # Base dependencies "transformers>=4.38.0", - "huggingface_hub>=0.22.0", + "huggingface_hub>=0.23.0", "torch>=2.0", "GitPython>=3.1.41", # for logging "datasets>=2.14.0", @@ -86,6 +86,7 @@ nanotron = [ "nanotron", "tensorboardX" ] +tensorboardX = ["tensorboardX"] quality = ["ruff==v0.2.2","pre-commit"] tests = ["pytest==7.4.0"] dev = ["lighteval[accelerate,quality,tests]"] diff --git a/run_evals_accelerate.py b/run_evals_accelerate.py index 20b6ec9f..d623de25 100644 --- a/run_evals_accelerate.py +++ b/run_evals_accelerate.py @@ -48,6 +48,7 @@ def get_parser(): parser.add_argument("--push_results_to_hub", default=False, action="store_true") parser.add_argument("--save_details", action="store_true") parser.add_argument("--push_details_to_hub", default=False, action="store_true") + parser.add_argument("--push_results_to_tensorboard", default=False, action="store_true") parser.add_argument( "--public_run", default=False, action="store_true", help="Push results and details to a public repo" ) diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index f4bdf956..b1dbe616 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -41,11 +41,11 @@ TaskConfigLogger, VersionsLogger, ) -from lighteval.utils import is_nanotron_available, obj_to_markdown +from lighteval.utils import NO_TENSORBOARDX_WARN_MSG, is_nanotron_available, is_tensorboardX_available, obj_to_markdown if is_nanotron_available(): - from nanotron.config import Config + from nanotron.config import GeneralArgs class EnhancedJSONEncoder(json.JSONEncoder): @@ -80,56 +80,74 @@ class EvaluationTracker: task_config_logger: TaskConfigLogger hub_results_org: str - def __init__(self, hub_results_org: str = "", token: str = "") -> None: - """ + def __init__( + self, + output_dir: str = None, + hub_results_org: str = "", + push_results_to_hub: bool = False, + push_details_to_hub: bool = False, + push_results_to_tensorboard: bool = False, + tensorboard_metric_prefix: str = "eval", + public: bool = False, + token: str = "", + nanotron_run_info: "GeneralArgs" = None, + ) -> None: + """) Creates all the necessary loggers for evaluation tracking. Args: + output_dir (str): Local folder path where you want results to be saved hub_results_org (str): The organisation to push the results to. See more details about the datasets organisation in [`EvaluationTracker.save`] + push_results_to_hub (bool): If True, results are pushed to the hub. + Results will be pushed either to `{hub_results_org}/results`, a public dataset, if `public` is True else to `{hub_results_org}/private-results`, a private dataset. + push_details_to_hub (bool): If True, details are pushed to the hub. 
+ Results are pushed to `{hub_results_org}/details__{sanitized model_name}` for the model `model_name`, a public dataset, + if `public` is True else `{hub_results_org}/details__{sanitized model_name}_private`, a private dataset. + push_results_to_tensorboard (bool): If True, will create and push the results for a tensorboard folder on the hub + public (bool): If True, results and details are pushed in private orgs token (str): Token to use when pushing to the hub. This token should have write access to `hub_results_org`. + nanotron_run_info (GeneralArgs): Reference to informations about Nanotron models runs """ self.details_logger = DetailsLogger() self.metrics_logger = MetricsLogger() self.versions_logger = VersionsLogger() self.general_config_logger = GeneralConfigLogger() self.task_config_logger = TaskConfigLogger() - self.hub_results_org = hub_results_org - self.hub_results_repo = f"{hub_results_org}/results" - self.hub_private_results_repo = f"{hub_results_org}/private-results" + self.api = HfApi(token=token) - def save( - self, - output_dir: str, - push_results_to_hub: bool, - push_details_to_hub: bool, - public: bool, - push_results_to_tensorboard: bool = False, - ) -> None: - """Saves the experiment information and results to files, and to the hub if requested. + self.output_dir = output_dir - Note: - In case of save failure, this function will only print a warning, with the error message. + self.hub_results_org = hub_results_org # will also contain tensorboard results + if hub_results_org in ["", None] and any( + [push_details_to_hub, push_results_to_hub, push_results_to_tensorboard] + ): + raise Exception( + "You need to select which org to push to, using `--results_org`, if you want to save information to the hub." + ) - Args: - output_dir (str): Local folder path where you want results to be saved - push_results_to_hub (bool): If True, results are pushed to the hub. - Results will be pushed either to `{hub_results_org}/results`, a public dataset, if `public` is True else to `{hub_results_org}/private-results`, a private dataset. - push_details_to_hub (bool): If True, details are pushed to the hub. - Results are pushed to `{hub_results_org}/details__{sanitized model_name}` for the model `model_name`, a public dataset, - if `public` is True else `{hub_results_org}/details__{sanitized model_name}_private`, a private dataset. 
- public (bool): If True, results and details are pushed in private orgs + self.hub_results_repo = f"{hub_results_org}/results" + self.hub_private_results_repo = f"{hub_results_org}/private-results" + self.push_results_to_hub = push_results_to_hub + self.push_details_to_hub = push_details_to_hub - """ + self.push_results_to_tensorboard = push_results_to_tensorboard + self.tensorboard_repo = f"{hub_results_org}/tensorboard_logs" + self.tensorboard_metric_prefix = tensorboard_metric_prefix + self.nanotron_run_info = nanotron_run_info + + self.public = public + + def save(self) -> None: + """Saves the experiment information and results to files, and to the hub if requested.""" hlog("Saving experiment tracker") - # try: date_id = datetime.now().isoformat().replace(":", "-") - output_dir_results = Path(output_dir) / "results" / self.general_config_logger.model_name - output_dir_details = Path(output_dir) / "details" / self.general_config_logger.model_name + output_dir_results = Path(self.output_dir) / "results" / self.general_config_logger.model_name + output_dir_details = Path(self.output_dir) / "details" / self.general_config_logger.model_name output_dir_details_sub_folder = output_dir_details / date_id output_dir_results.mkdir(parents=True, exist_ok=True) output_dir_details_sub_folder.mkdir(parents=True, exist_ok=True) @@ -140,9 +158,6 @@ def save( hlog(f"Saving results to {output_results_file} and {output_results_in_details_file}") config_general = copy.deepcopy(self.general_config_logger) - config_general.config = ( - config_general.config.as_dict() if is_dataclass(config_general.config) else config_general.config - ) config_general = asdict(config_general) to_dump = { @@ -163,14 +178,8 @@ def save( for task_name, task_details in self.details_logger.details.items(): output_file_details = output_dir_details_sub_folder / f"details_{task_name}_{date_id}.parquet" - # Create a dataset from the dictionary - try: - dataset = Dataset.from_list([asdict(detail) for detail in task_details]) - except Exception: - # We force cast to str to avoid formatting problems for nested objects - dataset = Dataset.from_list( - [{k: str(v) for k, v in asdict(detail).items()} for detail in task_details] - ) + # Create a dataset from the dictionary - we force cast to str to avoid formatting problems for nested objects + dataset = Dataset.from_list([{k: str(v) for k, v in asdict(detail).items()} for detail in task_details]) # We don't keep 'id' around if it's there column_names = dataset.column_names @@ -182,30 +191,25 @@ def save( # Save the dataset to a Parquet file dataset.to_parquet(output_file_details.as_posix()) - if push_results_to_hub: + if self.push_results_to_hub: self.api.upload_folder( - repo_id=self.hub_results_repo if public else self.hub_private_results_repo, + repo_id=self.hub_results_repo if self.public else self.hub_private_results_repo, folder_path=output_dir_results, path_in_repo=self.general_config_logger.model_name, repo_type="dataset", commit_message=f"Updating model {self.general_config_logger.model_name}", ) - if push_details_to_hub: + if self.push_details_to_hub: self.details_to_hub( - model_name=self.general_config_logger.model_name, results_file_path=output_results_in_details_file, details_folder_path=output_dir_details_sub_folder, - push_as_public=public, ) - if push_results_to_tensorboard: - self.push_results_to_tensorboard( + if self.push_results_to_tensorboard: + self.push_to_tensorboard( results=self.metrics_logger.metric_aggregated, details=self.details_logger.details ) - # except 
Exception as e: - # hlog("WARNING: Could not save results") - # hlog(repr(e)) def generate_final_dict(self) -> dict: """Aggregates and returns all the logger's experiment information in a dictionary. @@ -230,29 +234,25 @@ def generate_final_dict(self) -> dict: def details_to_hub( self, - model_name: str, results_file_path: Path | str, details_folder_path: Path | str, - push_as_public: bool = False, ) -> None: """Pushes the experiment details (all the model predictions for every step) to the hub. Args: - model_name (str): Name of the currently evaluated model results_file_path (str or Path): Local path of the current's experiment aggregated results individual file details_folder_path (str or Path): Local path of the current's experiment details folder. The details folder (created by [`EvaluationTracker.save`]) should contain one parquet file per task used during the evaluation run of the current model. - push_as_public (bool, optional): If True, the results will be pushed publicly, else the datasets will be private. """ results_file_path = str(results_file_path) details_folder_path = str(details_folder_path) - sanitized_model_name = model_name.replace("/", "__") + sanitized_model_name = self.general_config_logger.model_name.replace("/", "__") # "Default" detail names are the public detail names (same as results vs private-results) repo_id = f"{self.hub_results_org}/details_{sanitized_model_name}" - if not push_as_public: # if not public, we add `_private` + if not self.public: # if not public, we add `_private` repo_id = f"{repo_id}_private" sub_folder_path = os.path.basename(results_file_path).replace(".json", "").replace("results_", "") @@ -265,7 +265,7 @@ def details_to_hub( if len(checked_paths) == 0: hlog(f"Repo {repo_id} not found for {results_file_path}. Creating it.") - self.api.create_repo(repo_id, private=not (push_as_public), repo_type="dataset", exist_ok=True) + self.api.create_repo(repo_id, private=not (self.public), repo_type="dataset", exist_ok=True) # Create parquet version of results file as well results = load_dataset("json", data_files=results_file_path) @@ -287,43 +287,45 @@ def details_to_hub( repo_id=repo_id, folder_path=details_folder_path, path_in_repo=sub_folder_path, repo_type="dataset" ) - self.recreate_metadata_card(repo_id, model_name) + self.recreate_metadata_card(repo_id) - def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None: # noqa: C901 + def recreate_metadata_card(self, repo_id: str) -> None: # noqa: C901 """Fully updates the details repository metadata card for the currently evaluated model Args: repo_id (str): Details dataset repository path on the hub (`org/dataset`) - model_name (str): Name of the currently evaluated model. 
- """ # Add a nice dataset card and the configuration YAML files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset") results_files = [f for f in files_in_repo if ".json" in f] - parquet_results_files = [f for f in files_in_repo if ".parquet" in f and "results_" in f] - parquet_files = [f for f in files_in_repo if ".parquet" in f and "results_" not in f] + parquet_files = [f for f in files_in_repo if ".parquet" in f] multiple_results = len(results_files) > 1 # Get last eval results date for each task (evals might be non overlapping) last_eval_date_results = {} for sub_file in parquet_files: + # We focus on details only + if "results_" in sub_file: + continue + # subfile have this general format: # `2023-09-03T10-57-04.203304/details_harness|hendrycksTest-us_foreign_policy|5_2023-09-03T10-57-04.203304.parquet` # in the iso date, the `:` are replaced by `-` because windows does not allow `:` in their filenames - - task_name = os.path.basename(sub_file).replace("details_", "").split("_2023")[0].split("_2024")[0] + task_name = ( + os.path.basename(sub_file).replace("details_", "").split("_202")[0] + ) # 202 for dates, 2023, 2024, ... # task_name is then equal to `leaderboard|mmlu:us_foreign_policy|5` - iso_date = os.path.dirname(sub_file) # to be able to parse the filename as iso dates, we need to re-replace the `-` with `:` # iso_date[13] = iso_date[16] = ':' - iso_date = iso_date[:13] + ":" + iso_date[14:16] + ":" + iso_date[17:] - + dir_name = os.path.dirname(sub_file) + iso_date = ":".join(dir_name.rsplit("-", 2)) eval_date = datetime.fromisoformat(iso_date) last_eval_date_results[task_name] = ( max(last_eval_date_results[task_name], eval_date) if task_name in last_eval_date_results else eval_date ) + max_last_eval_date_results = list(last_eval_date_results.values())[0] # Now we convert them in iso-format for task in last_eval_date_results: @@ -336,43 +338,20 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None: card_metadata = MetadataConfigs() # Add the results config and add the result file as a parquet file - for sub_file in parquet_results_files: - eval_date = os.path.basename(sub_file).replace("results_", "").replace(".parquet", "") - sanitized_eval_date = re.sub(r"[^\w\.]", "_", eval_date) - sanitized_last_eval_date_results = re.sub(r"[^\w\.]", "_", max_last_eval_date_results) - - repo_file_name = os.path.basename(sub_file) - - if multiple_results: - if "results" not in card_metadata: - card_metadata["results"] = { - "data_files": [{"split": sanitized_eval_date, "path": [repo_file_name]}] - } - else: - former_entry = card_metadata["results"] - card_metadata["results"] = { - "data_files": former_entry["data_files"] - + [{"split": sanitized_eval_date, "path": [repo_file_name]}] - } + for sub_file in parquet_files: + if "results_" in sub_file: + eval_date = os.path.basename(sub_file).replace("results_", "").replace(".parquet", "") + sanitized_task = "results" + sanitized_last_eval_date_results = re.sub(r"[^\w\.]", "_", max_last_eval_date_results) + repo_file_name = os.path.basename(sub_file) else: - if "results" in card_metadata: - raise ValueError( - f"Entry for results already exists in {former_entry} for repo {repo_id} and file {sub_file}" - ) - card_metadata["results"] = {"data_files": [{"split": sanitized_eval_date, "path": [repo_file_name]}]} + task_name = os.path.basename(sub_file).replace("details_", "").split("_2023")[0].split("_2024")[0] + sanitized_task = re.sub(r"\W", "_", task_name) + eval_date = os.path.dirname(sub_file) + 
sanitized_last_eval_date_results = re.sub(r"[^\w\.]", "_", last_eval_date_results[task_name]) + repo_file_name = os.path.join("**", os.path.basename(sub_file)) - if sanitized_eval_date == sanitized_last_eval_date_results: - all_entry = card_metadata["results"]["data_files"] - card_metadata["results"] = {"data_files": all_entry + [{"split": "latest", "path": [repo_file_name]}]} - - # Add the tasks details configs - for sub_file in parquet_files: - task_name = os.path.basename(sub_file).replace("details_", "").split("_2023")[0].split("_2024")[0] - sanitized_task = re.sub(r"\W", "_", task_name) - eval_date = os.path.dirname(sub_file) sanitized_eval_date = re.sub(r"[^\w\.]", "_", eval_date) - repo_file_name = os.path.join("**", os.path.basename(sub_file)) - sanitized_last_eval_date_results = re.sub(r"[^\w\.]", "_", last_eval_date_results[task_name]) if multiple_results: if sanitized_task not in card_metadata: @@ -400,6 +379,9 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None: "data_files": all_entry + [{"split": "latest", "path": [repo_file_name]}] } + if "results_" in sub_file: + continue + # Special case for MMLU with a single split covering it all # We add another config with all MMLU splits results together for easy inspection SPECIAL_TASKS = [ @@ -481,7 +463,7 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None: card_data = DatasetCardData( dataset_summary=f"Dataset automatically created during the evaluation run of model " - f"[{model_name}](https://huggingface.co/{model_name})" + f"[{self.general_config_logger.model_name}](https://huggingface.co/{self.general_config_logger.model_name})" f"{org_string}.\n\n" f"The dataset is composed of {len(card_metadata) - 1} configuration, each one coresponding to one of the evaluated task.\n\n" f"The dataset has been created from {len(results_files)} run(s). Each run can be found as a specific split in each " @@ -494,8 +476,8 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None: f"(note that their might be results for other tasks in the repos if successive evals didn't cover the same tasks. " f'You find each in the results and the "latest" split for each eval):\n\n' f"```python\n{results_string}\n```", - repo_url=f"https://huggingface.co/{model_name}", - pretty_name=f"Evaluation run of {model_name}", + repo_url=f"https://huggingface.co/{self.general_config_logger.model_name}", + pretty_name=f"Evaluation run of {self.general_config_logger.model_name}", leaderboard_url=leaderboard_url, point_of_contact=point_of_contact, ) @@ -507,27 +489,30 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None: ) card.push_to_hub(repo_id, repo_type="dataset") - def push_results_to_tensorboard( # noqa: C901 + def push_to_tensorboard( # noqa: C901 self, results: dict[str, dict[str, float]], details: dict[str, DetailsLogger.CompiledDetail] ): + if not is_tensorboardX_available: + hlog_warn(NO_TENSORBOARDX_WARN_MSG) + return + if not is_nanotron_available(): hlog_warn("You cannot push results to tensorboard without having nanotron installed. 
Skipping")
            return
-        config: Config = self.general_config_logger.config
-        lighteval_config = config.lighteval
-        try:
-            global_step = config.general.step
-        except ValueError:
-            global_step = 0
-        if config.lighteval.logging.tensorboard_metric_prefix is not None:
-            prefix = config.lighteval.logging.tensorboard_metric_prefix
+        prefix = self.tensorboard_metric_prefix
+
+        if self.nanotron_run_info is not None:
+            global_step = self.nanotron_run_info.step
+            run = f"{self.nanotron_run_info.run}_{prefix}"
         else:
-            prefix = "eval"
-        output_dir_tb = Path(lighteval_config.logging.local_output_path) / "tb" / (config.general.run + "_" + prefix)
+            global_step = 0
+            run = prefix
+
+        output_dir_tb = Path(self.output_dir) / "tb" / run
         output_dir_tb.mkdir(parents=True, exist_ok=True)
         tb_context = HFSummaryWriter(
             logdir=str(output_dir_tb),
-            repo_id=lighteval_config.logging.hub_repo_tensorboard,
+            repo_id=self.tensorboard_repo,
             repo_private=True,
             path_in_repo="tb",
             commit_every=6000,  # Very long time so that we can change our files names and trigger push ourselves (see below)
@@ -559,14 +544,13 @@ def push_results_to_tensorboard(  # noqa: C901
                     )
                 else:
                     tb_context.add_scalar(f"{prefix}/{task_name}/{metric}", value, global_step=global_step)
-        # e.g. MMLU
+        # Tasks with subtasks
        for name, values in bench_averages.items():
            for metric, values in values.items():
                hlog(f"Pushing average {name} {metric} {sum(values) / len(values)} to tensorboard")
                tb_context.add_scalar(f"{prefix}/{name}/{metric}", sum(values) / len(values), global_step=global_step)

        tb_context.add_text("eval_config", obj_to_markdown(results), global_step=global_step)
-        # tb_context.add_text("eval_sizes", obj_to_markdown(sizes), global_step=global_step)

        for task_name, task_details in details.items():
            tb_context.add_text(
@@ -589,8 +573,6 @@ def push_results_to_tensorboard(  # noqa: C901
        # Now we can push to the hub
        tb_context.scheduler.trigger()
        hlog(
-            f"Pushed to tensorboard at https://huggingface.co/tensorboard/{lighteval_config.logging.hub_repo_tensorboard}/"
-            f" at {output_dir_tb} and global_step {global_step}"
+            f"Pushed to tensorboard at https://huggingface.co/{self.tensorboard_repo}/{output_dir_tb}/tensorboard"
+            f" at global_step {global_step}"
        )
-        # except Exception as e:
-        #     logger.warning(f"Could not push to tensorboard\n{e}")
diff --git a/src/lighteval/main_accelerate.py b/src/lighteval/main_accelerate.py
index d2ffbbe3..12122c52 100644
--- a/src/lighteval/main_accelerate.py
+++ b/src/lighteval/main_accelerate.py
@@ -56,7 +56,15 @@
 @htrack()
 def main(args):
     env_config = EnvConfig(token=TOKEN, cache_dir=args.cache_dir)
-    evaluation_tracker = EvaluationTracker(hub_results_org=args.results_org, token=TOKEN)
+    evaluation_tracker = EvaluationTracker(
+        output_dir=args.output_dir,
+        hub_results_org=args.results_org,
+        push_results_to_hub=args.push_results_to_hub,
+        push_details_to_hub=args.push_details_to_hub,
+        push_results_to_tensorboard=args.push_results_to_tensorboard,
+        public=args.public_run,
+        token=TOKEN,
+    )
     evaluation_tracker.general_config_logger.log_args_info(
         args.num_fewshot_seeds, args.override_batch_size, args.max_samples, args.job_id
     )
@@ -124,9 +132,7 @@ def main(args):
     evaluation_tracker.details_logger.aggregate()

     if args.output_dir:
-        evaluation_tracker.save(
-            args.output_dir, args.push_results_to_hub, args.push_details_to_hub, args.public_run
-        )
+        evaluation_tracker.save()

     final_dict = evaluation_tracker.generate_final_dict()

diff --git a/src/lighteval/main_nanotron.py b/src/lighteval/main_nanotron.py
index 4610ea86..f479c5d7
100644 --- a/src/lighteval/main_nanotron.py +++ b/src/lighteval/main_nanotron.py @@ -96,7 +96,13 @@ def main( data_parallel_size=lighteval_config.parallelism.dp, ) - evaluation_tracker = EvaluationTracker(token=TOKEN) + evaluation_tracker = EvaluationTracker( + token=TOKEN, + output_dir=lighteval_config.logging.local_output_path, + hub_results_org=lighteval_config.logging.hub_repo_tensorboard, + tensorboard_metric_prefix=lighteval_config.logging.tensorboard_metric_prefix, + nanotron_run_info=nanotron_config.general, + ) evaluation_tracker.general_config_logger.log_args_info( num_fewshot_seeds=1, override_batch_size=None, diff --git a/src/lighteval/utils.py b/src/lighteval/utils.py index 768a1cd8..16235785 100644 --- a/src/lighteval/utils.py +++ b/src/lighteval/utils.py @@ -191,6 +191,15 @@ def is_peft_available() -> bool: NO_PEFT_ERROR_MSG = "You are trying to use adapter weights models, for which you need `peft`, which is not available in your environment. Please install it using pip." +def is_tensorboardX_available() -> bool: + return importlib.util.find_spec("tensorboardX") is not None + + +NO_TENSORBOARDX_WARN_MSG = ( + "You are trying to log using tensorboardX, which is not installed. Please install it using pip. Skipping." +) + + def is_openai_available() -> bool: return importlib.util.find_spec("openai") is not None From 3f9095097cccaeff0dc3455a789011a9265d7782 Mon Sep 17 00:00:00 2001 From: Sadra Barikbin Date: Tue, 9 Jul 2024 15:12:50 +0330 Subject: [PATCH 23/25] Fix a few typos in `metrics.py` (#218) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --------- Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> --- src/lighteval/metrics/metrics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index 262b20a0..3c83625c 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -301,7 +301,7 @@ class Metrics(Enum): sample_level_fn=LoglikelihoodPreparator().prepare, category=MetricCategory.MULTICHOICE, use_case=MetricUseCase.ACCURACY, - corpus_level_fn=CorpusLevelF1Score(None), + corpus_level_fn=CorpusLevelF1Score(None).compute, higher_is_better=True, ) loglikelihood_f1_single_token = CorpusLevelMetric( @@ -309,7 +309,7 @@ class Metrics(Enum): sample_level_fn=LoglikelihoodPreparator(is_single_token=True).prepare, category=MetricCategory.MULTICHOICE_ONE_TOKEN, use_case=MetricUseCase.ACCURACY, - corpus_level_fn=CorpusLevelF1Score(None), + corpus_level_fn=CorpusLevelF1Score(None).compute, higher_is_better=True, ) mcc = CorpusLevelMetric( @@ -385,7 +385,7 @@ class Metrics(Enum): sample_level_fn=LoglikelihoodPreparator(is_single_token=True).prepare, category=MetricCategory.MULTICHOICE_ONE_TOKEN, use_case=MetricUseCase.ACCURACY, - corpus_level_fn=CorpusLevelF1Score(average=None, num_classes=3), + corpus_level_fn=CorpusLevelF1Score(average=None, num_classes=3).compute, higher_is_better=True, ) perfect_exact_match = SampleLevelMetric( From 3aaec227c9ae9924b423f88a6c46fdb81249d215 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Fourrier?= <22726840+clefourrier@users.noreply.github.com> Date: Tue, 9 Jul 2024 14:42:07 +0200 Subject: [PATCH 24/25] Use only dataclasses for task init (#212) * replaced json tasks by python tasks --------- Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> --- 
community_tasks/_template.py | 6 +- community_tasks/aimo_evals.py | 6 +- community_tasks/arabic_evals.py | 7 +- community_tasks/german_rag_evals.py | 6 +- examples/nanotron/custom_evaluation_tasks.py | 2 +- src/lighteval/tasks/default_tasks.py | 22665 ++++++++++++++++ src/lighteval/tasks/extended/ifeval/main.py | 4 +- src/lighteval/tasks/extended/mt_bench/main.py | 4 +- .../tasks/extended/tiny_benchmarks/main.py | 6 +- src/lighteval/tasks/lighteval_task.py | 22 +- src/lighteval/tasks/registry.py | 26 +- src/lighteval/tasks/tasks_table.jsonl | 1235 - src/lighteval/utils.py | 2 +- 13 files changed, 22692 insertions(+), 1299 deletions(-) create mode 100644 src/lighteval/tasks/default_tasks.py delete mode 100644 src/lighteval/tasks/tasks_table.jsonl diff --git a/community_tasks/_template.py b/community_tasks/_template.py index 6b52f9f4..fe0d8e1d 100644 --- a/community_tasks/_template.py +++ b/community_tasks/_template.py @@ -106,7 +106,7 @@ def prompt_fn(line, task_name: str = None): # STORE YOUR EVALS SUBSET_TASKS = [CustomSubsetTask(name=f"mytask:{subset}", hf_subset=subset) for subset in SAMPLE_SUBSETS] -_TASKS = SUBSET_TASKS + [task] +TASKS_TABLE = SUBSET_TASKS + [task] # CUSTOM METRIC IF NEEDED @@ -124,8 +124,6 @@ def prompt_fn(line, task_name: str = None): # MODULE LOGIC # You should not need to touch this # Convert to dict for lighteval -TASKS_TABLE = [task.as_dict() for task in _TASKS] - if __name__ == "__main__": - print(t["name"] for t in TASKS_TABLE) + print(t.name for t in TASKS_TABLE) print(len(TASKS_TABLE)) diff --git a/community_tasks/aimo_evals.py b/community_tasks/aimo_evals.py index 556ae663..5262a013 100644 --- a/community_tasks/aimo_evals.py +++ b/community_tasks/aimo_evals.py @@ -55,14 +55,12 @@ def aimo_prompt(line, task_name: str = None): # STORE YOUR EVALS -_TASKS = [task] +TASKS_TABLE = [task] # MODULE LOGIC # You should not need to touch this -# Convert to dict for lighteval -TASKS_TABLE = [task.as_dict() for task in _TASKS] if __name__ == "__main__": - print(t["name"] for t in TASKS_TABLE) + print(t.name for t in TASKS_TABLE) print(len(TASKS_TABLE)) diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index 9e65bade..495c95d9 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -577,7 +577,7 @@ def sciq_prompt_arabic(line, task_name: str = None): ) -_TASKS = ( +TASKS_TABLE = ( ARABIC_MMLU_TASKS + ACVA_TASKS + ALGHAFA_TASKS @@ -595,9 +595,6 @@ def sciq_prompt_arabic(line, task_name: str = None): + [sciq_ar_task] ) -# Convert to dict for lighteval -TASKS_TABLE = [task.as_dict() for task in _TASKS] - if __name__ == "__main__": - print(t["name"] for t in TASKS_TABLE) + print(t.name for t in TASKS_TABLE) print(len(TASKS_TABLE)) diff --git a/community_tasks/german_rag_evals.py b/community_tasks/german_rag_evals.py index fdda9d7a..0d2c76c0 100644 --- a/community_tasks/german_rag_evals.py +++ b/community_tasks/german_rag_evals.py @@ -219,14 +219,12 @@ def prompt_fn_context_question_match(line, task_name: str = None): # STORE YOUR EVALS -_TASKS = [task1, task2, task3, task4] +TASKS_TABLE = [task1, task2, task3, task4] # MODULE LOGIC # You should not need to touch this -# Convert to dict for lighteval -TASKS_TABLE = [task.as_dict() for task in _TASKS] if __name__ == "__main__": - print(t["name"] for t in TASKS_TABLE) + print(t.name for t in TASKS_TABLE) print(len(TASKS_TABLE)) diff --git a/examples/nanotron/custom_evaluation_tasks.py b/examples/nanotron/custom_evaluation_tasks.py index cdca8385..62aa8dc4 100644 --- 
a/examples/nanotron/custom_evaluation_tasks.py +++ b/examples/nanotron/custom_evaluation_tasks.py @@ -679,7 +679,7 @@ def agi_eval_prompt_no_letters(line, task_name: str = None): EARLY_SIGNAL_TASKS = ",".join([t[1] for t in COMMON_SENSE_REASONING_STRING] + [t[1] for t in MMLU_STRING]) # Convert to dict for lighteval -TASKS_TABLE = [task.as_dict() for task in _TASKS] +TASKS_TABLE = _TASKS # You can have a few pre-organised groups of tasks TASKS_GROUPS = { "all": ",".join(t[1] for t in _TASKS_STRINGS), diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py new file mode 100644 index 00000000..dbfdfe09 --- /dev/null +++ b/src/lighteval/tasks/default_tasks.py @@ -0,0 +1,22665 @@ +# MIT License + +# Copyright (c) 2024 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +abstract_narrative_understanding_bigbench = LightevalTaskConfig( + name="abstract_narrative_understanding", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="abstract_narrative_understanding", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_aqua_rat_lighteval = LightevalTaskConfig( + name="agieval:aqua-rat", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-aqua-rat", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_gaokao_biology_lighteval = LightevalTaskConfig( + name="agieval:gaokao-biology", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-gaokao-biology", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_gaokao_chemistry_lighteval = LightevalTaskConfig( + name="agieval:gaokao-chemistry", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-gaokao-chemistry", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_gaokao_chinese_lighteval = LightevalTaskConfig( + name="agieval:gaokao-chinese", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-gaokao-chinese", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_gaokao_english_lighteval = LightevalTaskConfig( + name="agieval:gaokao-english", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-gaokao-english", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_gaokao_geography_lighteval = LightevalTaskConfig( + name="agieval:gaokao-geography", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-gaokao-geography", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + 
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_gaokao_history_lighteval = LightevalTaskConfig( + name="agieval:gaokao-history", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-gaokao-history", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_gaokao_mathqa_lighteval = LightevalTaskConfig( + name="agieval:gaokao-mathqa", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-gaokao-mathqa", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_gaokao_physics_lighteval = LightevalTaskConfig( + name="agieval:gaokao-physics", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-gaokao-physics", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_logiqa_en_lighteval = LightevalTaskConfig( + name="agieval:logiqa-en", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-logiqa-en", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_logiqa_zh_lighteval = LightevalTaskConfig( + name="agieval:logiqa-zh", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-logiqa-zh", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_lsat_ar_lighteval = LightevalTaskConfig( + name="agieval:lsat-ar", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-lsat-ar", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_lsat_lr_lighteval = LightevalTaskConfig( + name="agieval:lsat-lr", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-lsat-lr", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + 
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_lsat_rc_lighteval = LightevalTaskConfig( + name="agieval:lsat-rc", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-lsat-rc", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_sat_en_lighteval = LightevalTaskConfig( + name="agieval:sat-en", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-sat-en", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_sat_en_without_passage_lighteval = LightevalTaskConfig( + name="agieval:sat-en-without-passage", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-sat-en-without-passage", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_sat_math_lighteval = LightevalTaskConfig( + name="agieval:sat-math", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-sat-math", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +anachronisms_bigbench = LightevalTaskConfig( + name="anachronisms", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="anachronisms", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +analogical_similarity_bigbench = LightevalTaskConfig( + name="analogical_similarity", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="analogical_similarity", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +analytic_entailment_bigbench = LightevalTaskConfig( + name="analytic_entailment", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="analytic_entailment", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + 
metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +anli_lighteval = LightevalTaskConfig( + name="anli", + suite=["lighteval", "anli"], + prompt_function="anli", + hf_repo="anli", + hf_subset="plain_text", + hf_avail_splits=[ + "train_r1", + "dev_r1", + "train_r2", + "dev_r2", + "train_r3", + "dev_r3", + "test_r1", + "test_r2", + "test_r3", + ], + evaluation_splits=["test_r1", "test_r2", "test_r3"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +anli_r1_lighteval = LightevalTaskConfig( + name="anli:r1", + suite=["lighteval", "anli"], + prompt_function="anli", + hf_repo="anli", + hf_subset="plain_text", + hf_avail_splits=["train_r1", "dev_r1", "test_r1"], + evaluation_splits=["test_r1"], + few_shots_split="train_r1", + few_shots_select="random_sampling_from_train", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +anli_r2_lighteval = LightevalTaskConfig( + name="anli:r2", + suite=["lighteval", "anli"], + prompt_function="anli", + hf_repo="anli", + hf_subset="plain_text", + hf_avail_splits=["train_r2", "dev_r2", "test_r2"], + evaluation_splits=["test_r2"], + few_shots_split="train_r2", + few_shots_select="random_sampling_from_train", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +anli_r3_lighteval = LightevalTaskConfig( + name="anli:r3", + suite=["lighteval", "anli"], + prompt_function="anli", + hf_repo="anli", + hf_subset="plain_text", + hf_avail_splits=["train_r3", "dev_r3", "test_r3"], + evaluation_splits=["test_r3"], + few_shots_split="train_r3", + few_shots_select="random_sampling_from_train", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arc_c_letters_original = LightevalTaskConfig( + name="arc:c:letters", + suite=["original", "arc"], + prompt_function="arc_with_options_letters_predict", + hf_repo="ai2_arc", + hf_subset="ARC-Challenge", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arc_c_options_original = LightevalTaskConfig( + name="arc:c:options", + suite=["original", "arc"], + prompt_function="arc_with_options", + hf_repo="ai2_arc", + hf_subset="ARC-Challenge", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arc_c_simple_original = LightevalTaskConfig( + name="arc:c:simple", + suite=["original", "arc"], + prompt_function="arc", + hf_repo="ai2_arc", + hf_subset="ARC-Challenge", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + 
stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arc_challenge_leaderboard = LightevalTaskConfig( + name="arc:challenge", + suite=["leaderboard", "arc"], + prompt_function="arc", + hf_repo="ai2_arc", + hf_subset="ARC-Challenge", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling_from_train", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arc_easy_lighteval = LightevalTaskConfig( + name="arc:easy", + suite=["lighteval", "arc"], + prompt_function="arc", + hf_repo="ai2_arc", + hf_subset="ARC-Easy", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling_from_train", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arithmetic_1dc_lighteval = LightevalTaskConfig( + name="arithmetic:1dc", + suite=["lighteval", "arithmetic"], + prompt_function="arithmetic", + hf_repo="EleutherAI/arithmetic", + hf_subset="arithmetic_1dc", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["acc_golds_likelihood"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arithmetic_2da_lighteval = LightevalTaskConfig( + name="arithmetic:2da", + suite=["lighteval", "arithmetic"], + prompt_function="arithmetic", + hf_repo="EleutherAI/arithmetic", + hf_subset="arithmetic_2da", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["acc_golds_likelihood"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arithmetic_2dm_lighteval = LightevalTaskConfig( + name="arithmetic:2dm", + suite=["lighteval", "arithmetic"], + prompt_function="arithmetic", + hf_repo="EleutherAI/arithmetic", + hf_subset="arithmetic_2dm", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["acc_golds_likelihood"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arithmetic_2ds_lighteval = LightevalTaskConfig( + name="arithmetic:2ds", + suite=["lighteval", "arithmetic"], + prompt_function="arithmetic", + hf_repo="EleutherAI/arithmetic", + hf_subset="arithmetic_2ds", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["acc_golds_likelihood"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arithmetic_3da_lighteval = LightevalTaskConfig( + name="arithmetic:3da", + suite=["lighteval", "arithmetic"], + prompt_function="arithmetic", + hf_repo="EleutherAI/arithmetic", + hf_subset="arithmetic_3da", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["acc_golds_likelihood"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arithmetic_3ds_lighteval = 
LightevalTaskConfig( + name="arithmetic:3ds", + suite=["lighteval", "arithmetic"], + prompt_function="arithmetic", + hf_repo="EleutherAI/arithmetic", + hf_subset="arithmetic_3ds", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["acc_golds_likelihood"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arithmetic_4da_lighteval = LightevalTaskConfig( + name="arithmetic:4da", + suite=["lighteval", "arithmetic"], + prompt_function="arithmetic", + hf_repo="EleutherAI/arithmetic", + hf_subset="arithmetic_4da", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["acc_golds_likelihood"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arithmetic_4ds_lighteval = LightevalTaskConfig( + name="arithmetic:4ds", + suite=["lighteval", "arithmetic"], + prompt_function="arithmetic", + hf_repo="EleutherAI/arithmetic", + hf_subset="arithmetic_4ds", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["acc_golds_likelihood"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arithmetic_5da_lighteval = LightevalTaskConfig( + name="arithmetic:5da", + suite=["lighteval", "arithmetic"], + prompt_function="arithmetic", + hf_repo="EleutherAI/arithmetic", + hf_subset="arithmetic_5da", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["acc_golds_likelihood"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arithmetic_5ds_lighteval = LightevalTaskConfig( + name="arithmetic:5ds", + suite=["lighteval", "arithmetic"], + prompt_function="arithmetic", + hf_repo="EleutherAI/arithmetic", + hf_subset="arithmetic_5ds", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["acc_golds_likelihood"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arithmetic_bb_bigbench = LightevalTaskConfig( + name="arithmetic_bb", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="arithmetic", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +ascii_word_recognition_bigbench = LightevalTaskConfig( + name="ascii_word_recognition", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="ascii_word_recognition", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +asdiv_lighteval = LightevalTaskConfig( + name="asdiv", + suite=["lighteval"], + prompt_function="asdiv", + hf_repo="EleutherAI/asdiv", + hf_subset="asdiv", + 
hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["acc_golds_likelihood"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +authorship_verification_bigbench = LightevalTaskConfig( + name="authorship_verification", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="authorship_verification", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +auto_categorization_bigbench = LightevalTaskConfig( + name="auto_categorization", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="auto_categorization", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleu"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +auto_debugging_bigbench_lite = LightevalTaskConfig( + name="auto_debugging", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_linefeed_before_and_after_query", + hf_repo="bigbench", + hf_subset="auto_debugging", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["perfect_exact_match"], + stop_sequence=None, + output_regex="[^\\.\\?\\!\\;\\n]+", + trust_dataset=True, + version=0, +) +babi_qa_helm = LightevalTaskConfig( + name="babi_qa", + suite=["helm"], + prompt_function="babi_qa", + hf_repo="facebook/babi_qa", + hf_subset="en-valid-qa1", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_causal_judgment_lighteval = LightevalTaskConfig( + name="bigbench:causal_judgment", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="causal_judgement", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_date_understanding_lighteval = LightevalTaskConfig( + name="bigbench:date_understanding", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="date_understanding", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_disambiguation_qa_lighteval = LightevalTaskConfig( + name="bigbench:disambiguation_qa", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="disambiguation_qa", + 
hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_geometric_shapes_lighteval = LightevalTaskConfig( + name="bigbench:geometric_shapes", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="geometric_shapes", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_logical_deduction_five_objects_lighteval = LightevalTaskConfig( + name="bigbench:logical_deduction_five_objects", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="logical_deduction_five_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_logical_deduction_seven_objects_lighteval = LightevalTaskConfig( + name="bigbench:logical_deduction_seven_objects", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="logical_deduction_seven_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_logical_deduction_three_objects_lighteval = LightevalTaskConfig( + name="bigbench:logical_deduction_three_objects", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="logical_deduction_three_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_movie_recommendation_lighteval = LightevalTaskConfig( + name="bigbench:movie_recommendation", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="movie_recommendation", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_navigate_lighteval = LightevalTaskConfig( + name="bigbench:navigate", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="navigate", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_reasoning_about_colored_objects_lighteval = LightevalTaskConfig( + name="bigbench:reasoning_about_colored_objects", + suite=["lighteval"], 
+ prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="reasoning_about_colored_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_ruin_names_lighteval = LightevalTaskConfig( + name="bigbench:ruin_names", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="ruin_names", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_salient_translation_error_detection_lighteval = LightevalTaskConfig( + name="bigbench:salient_translation_error_detection", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="salient_translation_error_detection", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_snarks_lighteval = LightevalTaskConfig( + name="bigbench:snarks", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="snarks", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_sports_understanding_lighteval = LightevalTaskConfig( + name="bigbench:sports_understanding", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="sports_understanding", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_temporal_sequences_lighteval = LightevalTaskConfig( + name="bigbench:temporal_sequences", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="temporal_sequences", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_tracking_shuffled_objects_five_objects_lighteval = LightevalTaskConfig( + name="bigbench:tracking_shuffled_objects_five_objects", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="tracking_shuffled_objects_five_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_tracking_shuffled_objects_seven_objects_lighteval = 
LightevalTaskConfig( + name="bigbench:tracking_shuffled_objects_seven_objects", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="tracking_shuffled_objects_seven_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_tracking_shuffled_objects_three_objects_lighteval = LightevalTaskConfig( + name="bigbench:tracking_shuffled_objects_three_objects", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="tracking_shuffled_objects_three_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_causal_judgment_harness = LightevalTaskConfig( + name="bigbench:causal_judgment", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="causal_judgement", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_date_understanding_harness = LightevalTaskConfig( + name="bigbench:date_understanding", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="date_understanding", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_disambiguation_qa_harness = LightevalTaskConfig( + name="bigbench:disambiguation_qa", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="disambiguation_qa", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_geometric_shapes_harness = LightevalTaskConfig( + name="bigbench:geometric_shapes", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="geometric_shapes", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_logical_deduction_five_objects_harness = LightevalTaskConfig( + name="bigbench:logical_deduction_five_objects", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="logical_deduction_five_objects", + hf_avail_splits=["train"], + 
evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_logical_deduction_seven_objects_harness = LightevalTaskConfig( + name="bigbench:logical_deduction_seven_objects", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="logical_deduction_seven_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_logical_deduction_three_objects_harness = LightevalTaskConfig( + name="bigbench:logical_deduction_three_objects", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="logical_deduction_three_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_movie_recommendation_harness = LightevalTaskConfig( + name="bigbench:movie_recommendation", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="movie_recommendation", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_navigate_harness = LightevalTaskConfig( + name="bigbench:navigate", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="navigate", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_reasoning_about_colored_objects_harness = LightevalTaskConfig( + name="bigbench:reasoning_about_colored_objects", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="reasoning_about_colored_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_ruin_names_harness = LightevalTaskConfig( + name="bigbench:ruin_names", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="ruin_names", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + 
output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_salient_translation_error_detection_harness = LightevalTaskConfig( + name="bigbench:salient_translation_error_detection", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="salient_translation_error_detection", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_snarks_harness = LightevalTaskConfig( + name="bigbench:snarks", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="snarks", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_sports_understanding_harness = LightevalTaskConfig( + name="bigbench:sports_understanding", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="sports_understanding", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_temporal_sequences_harness = LightevalTaskConfig( + name="bigbench:temporal_sequences", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="temporal_sequences", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_tracking_shuffled_objects_five_objects_harness = LightevalTaskConfig( + name="bigbench:tracking_shuffled_objects_five_objects", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="tracking_shuffled_objects_five_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_tracking_shuffled_objects_seven_objects_harness = LightevalTaskConfig( + name="bigbench:tracking_shuffled_objects_seven_objects", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="tracking_shuffled_objects_seven_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) 
+bigbench_tracking_shuffled_objects_three_objects_harness = LightevalTaskConfig( + name="bigbench:tracking_shuffled_objects_three_objects", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="tracking_shuffled_objects_three_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bbh_boolean_expressions_harness = LightevalTaskConfig( + name="bbh:boolean_expressions", + suite=["harness"], + prompt_function="bbh_boolean_expressions", + hf_repo="lukaemon/bbh", + hf_subset="boolean_expressions", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_causal_judgment_harness = LightevalTaskConfig( + name="bbh:causal_judgment", + suite=["harness"], + prompt_function="bbh_causal_judgment", + hf_repo="lukaemon/bbh", + hf_subset="causal_judgement", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_date_understanding_harness = LightevalTaskConfig( + name="bbh:date_understanding", + suite=["harness"], + prompt_function="bbh_date_understanding", + hf_repo="lukaemon/bbh", + hf_subset="date_understanding", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_disambiguation_qa_harness = LightevalTaskConfig( + name="bbh:disambiguation_qa", + suite=["harness"], + prompt_function="bbh_disambiguation_qa", + hf_repo="lukaemon/bbh", + hf_subset="disambiguation_qa", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_dyck_languages_harness = LightevalTaskConfig( + name="bbh:dyck_languages", + suite=["harness"], + prompt_function="bbh_dyck_languages", + hf_repo="lukaemon/bbh", + hf_subset="dyck_languages", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_formal_fallacies_harness 
= LightevalTaskConfig( + name="bbh:formal_fallacies", + suite=["harness"], + prompt_function="bbh_formal_fallacies", + hf_repo="lukaemon/bbh", + hf_subset="formal_fallacies", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_geometric_shapes_harness = LightevalTaskConfig( + name="bbh:geometric_shapes", + suite=["harness"], + prompt_function="bbh_geometric_shapes", + hf_repo="lukaemon/bbh", + hf_subset="geometric_shapes", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_hyperbaton_harness = LightevalTaskConfig( + name="bbh:hyperbaton", + suite=["harness"], + prompt_function="bbh_hyperbaton", + hf_repo="lukaemon/bbh", + hf_subset="hyperbaton", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_logical_deduction_five_objects_harness = LightevalTaskConfig( + name="bbh:logical_deduction_five_objects", + suite=["harness"], + prompt_function="bbh_logical_deduction_five_objects", + hf_repo="lukaemon/bbh", + hf_subset="logical_deduction_five_objects", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_logical_deduction_seven_objects_harness = LightevalTaskConfig( + name="bbh:logical_deduction_seven_objects", + suite=["harness"], + prompt_function="bbh_logical_deduction_seven_objects", + hf_repo="lukaemon/bbh", + hf_subset="logical_deduction_seven_objects", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_logical_deduction_three_objects_harness = LightevalTaskConfig( + name="bbh:logical_deduction_three_objects", + suite=["harness"], + prompt_function="bbh_logical_deduction_three_objects", + hf_repo="lukaemon/bbh", + hf_subset="logical_deduction_three_objects", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + 
frozen=False, + trust_dataset=True, + version=0, +) +bbh_movie_recommendation_harness = LightevalTaskConfig( + name="bbh:movie_recommendation", + suite=["harness"], + prompt_function="bbh_movie_recommendation", + hf_repo="lukaemon/bbh", + hf_subset="movie_recommendation", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_multistep_arithmetic_two_harness = LightevalTaskConfig( + name="bbh:multistep_arithmetic_two", + suite=["harness"], + prompt_function="bbh_multistep_arithmetic_two", + hf_repo="lukaemon/bbh", + hf_subset="multistep_arithmetic_two", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_navigate_harness = LightevalTaskConfig( + name="bbh:navigate", + suite=["harness"], + prompt_function="bbh_navigate", + hf_repo="lukaemon/bbh", + hf_subset="navigate", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_object_counting_harness = LightevalTaskConfig( + name="bbh:object_counting", + suite=["harness"], + prompt_function="bbh_object_counting", + hf_repo="lukaemon/bbh", + hf_subset="object_counting", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_penguins_in_a_table_harness = LightevalTaskConfig( + name="bbh:penguins_in_a_table", + suite=["harness"], + prompt_function="bbh_penguins_in_a_table", + hf_repo="lukaemon/bbh", + hf_subset="penguins_in_a_table", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_reasoning_about_colored_objects_harness = LightevalTaskConfig( + name="bbh:reasoning_about_colored_objects", + suite=["harness"], + prompt_function="bbh_reasoning_about_colored_objects", + hf_repo="lukaemon/bbh", + hf_subset="reasoning_about_colored_objects", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + 
output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_ruin_names_harness = LightevalTaskConfig( + name="bbh:ruin_names", + suite=["harness"], + prompt_function="bbh_ruin_names", + hf_repo="lukaemon/bbh", + hf_subset="ruin_names", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_salient_translation_error_detection_harness = LightevalTaskConfig( + name="bbh:salient_translation_error_detection", + suite=["harness"], + prompt_function="bbh_salient_translation_error_detection", + hf_repo="lukaemon/bbh", + hf_subset="salient_translation_error_detection", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_snarks_harness = LightevalTaskConfig( + name="bbh:snarks", + suite=["harness"], + prompt_function="bbh_snarks", + hf_repo="lukaemon/bbh", + hf_subset="snarks", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_sports_understanding_harness = LightevalTaskConfig( + name="bbh:sports_understanding", + suite=["harness"], + prompt_function="bbh_sports_understanding", + hf_repo="lukaemon/bbh", + hf_subset="sports_understanding", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_temporal_sequences_harness = LightevalTaskConfig( + name="bbh:temporal_sequences", + suite=["harness"], + prompt_function="bbh_temporal_sequences", + hf_repo="lukaemon/bbh", + hf_subset="temporal_sequences", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_tracking_shuffled_objects_five_objects_harness = LightevalTaskConfig( + name="bbh:tracking_shuffled_objects_five_objects", + suite=["harness"], + prompt_function="bbh_tracking_shuffled_objects_five_objects", + hf_repo="lukaemon/bbh", + hf_subset="tracking_shuffled_objects_five_objects", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + 
"perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_tracking_shuffled_objects_seven_objects_harness = LightevalTaskConfig( + name="bbh:tracking_shuffled_objects_seven_objects", + suite=["harness"], + prompt_function="bbh_tracking_shuffled_objects_seven_objects", + hf_repo="lukaemon/bbh", + hf_subset="tracking_shuffled_objects_seven_objects", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_tracking_shuffled_objects_three_objects_harness = LightevalTaskConfig( + name="bbh:tracking_shuffled_objects_three_objects", + suite=["harness"], + prompt_function="bbh_tracking_shuffled_objects_three_objects", + hf_repo="lukaemon/bbh", + hf_subset="tracking_shuffled_objects_three_objects", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_web_of_lies_harness = LightevalTaskConfig( + name="bbh:web_of_lies", + suite=["harness"], + prompt_function="bbh_web_of_lies", + hf_repo="lukaemon/bbh", + hf_subset="web_of_lies", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_word_sorting_harness = LightevalTaskConfig( + name="bbh:word_sorting", + suite=["harness"], + prompt_function="bbh_word_sorting", + hf_repo="lukaemon/bbh", + hf_subset="word_sorting", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbq_helm = LightevalTaskConfig( + name="bbq", + suite=["helm"], + prompt_function="bbq", + hf_repo="lighteval/bbq_helm", + hf_subset="all", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbq_Age_helm = LightevalTaskConfig( + name="bbq:Age", + suite=["helm"], + prompt_function="bbq", + hf_repo="lighteval/bbq_helm", + hf_subset="Age", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + 
stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbq_Disability_status_helm = LightevalTaskConfig( + name="bbq:Disability_status", + suite=["helm"], + prompt_function="bbq", + hf_repo="lighteval/bbq_helm", + hf_subset="Disability_status", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbq_Gender_identity_helm = LightevalTaskConfig( + name="bbq:Gender_identity", + suite=["helm"], + prompt_function="bbq", + hf_repo="lighteval/bbq_helm", + hf_subset="Gender_identity", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbq_Nationality_helm = LightevalTaskConfig( + name="bbq:Nationality", + suite=["helm"], + prompt_function="bbq", + hf_repo="lighteval/bbq_helm", + hf_subset="Nationality", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbq_Physical_appearance_helm = LightevalTaskConfig( + name="bbq:Physical_appearance", + suite=["helm"], + prompt_function="bbq", + hf_repo="lighteval/bbq_helm", + hf_subset="Physical_appearance", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbq_Race_ethnicity_helm = LightevalTaskConfig( + name="bbq:Race_ethnicity", + suite=["helm"], + prompt_function="bbq", + hf_repo="lighteval/bbq_helm", + hf_subset="Race_ethnicity", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbq_Race_x_SES_helm = LightevalTaskConfig( + name="bbq:Race_x_SES", + suite=["helm"], + prompt_function="bbq", + hf_repo="lighteval/bbq_helm", + hf_subset="Race_x_SES", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbq_Race_x_gender_helm = LightevalTaskConfig( + name="bbq:Race_x_gender", + suite=["helm"], + prompt_function="bbq", + 
hf_repo="lighteval/bbq_helm", + hf_subset="Race_x_gender", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbq_Religion_helm = LightevalTaskConfig( + name="bbq:Religion", + suite=["helm"], + prompt_function="bbq", + hf_repo="lighteval/bbq_helm", + hf_subset="Religion", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbq_SES_helm = LightevalTaskConfig( + name="bbq:SES", + suite=["helm"], + prompt_function="bbq", + hf_repo="lighteval/bbq_helm", + hf_subset="SES", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbq_Sexual_orientation_helm = LightevalTaskConfig( + name="bbq:Sexual_orientation", + suite=["helm"], + prompt_function="bbq", + hf_repo="lighteval/bbq_helm", + hf_subset="Sexual_orientation", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbq_lite_json_bigbench_lite = LightevalTaskConfig( + name="bbq_lite_json", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_linefeed_before_whitespace_after_query", + hf_repo="bigbench", + hf_subset="bbq_lite_json", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_auto_debugging_helm = LightevalTaskConfig( + name="bigbench:auto_debugging", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="auto_debugging", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_age_ambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:age_ambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-age_ambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", 
"quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_age_disambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:age_disambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-age_disambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_disability_status_ambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:disability_status_ambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-disability_status_ambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_disability_status_disambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:disability_status_disambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-disability_status_disambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_gender_identity_ambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:gender_identity_ambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-gender_identity_ambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_gender_identity_disambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:gender_identity_disambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-gender_identity_disambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_nationality_ambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:nationality_ambig", + suite=["helm", "bigbench_scenario"], + 
prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-nationality_ambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_nationality_disambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:nationality_disambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-nationality_disambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_physical_appearance_ambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:physical_appearance_ambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-physical_appearance_ambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_physical_appearance_disambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:physical_appearance_disambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-physical_appearance_disambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_race_ethnicity_ambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:race_ethnicity_ambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-race_ethnicity_ambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_race_ethnicity_disambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:race_ethnicity_disambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-race_ethnicity_disambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", 
"prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_religion_ambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:religion_ambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-religion_ambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_religion_disambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:religion_disambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-religion_disambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_ses_ambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:ses_ambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-ses_ambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_ses_disambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:ses_disambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-ses_disambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_sexual_orientation_ambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:sexual_orientation_ambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-sexual_orientation_ambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_sexual_orientation_disambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:sexual_orientation_disambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + 
hf_subset="bbq_lite_json-sexual_orientation_disambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_code_line_description_helm = LightevalTaskConfig( + name="bigbench:code_line_description", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="code_line_description", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conceptual_combinations_contradictions_helm = LightevalTaskConfig( + name="bigbench:conceptual_combinations:contradictions", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conceptual_combinations-contradictions", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conceptual_combinations_emergent_properties_helm = LightevalTaskConfig( + name="bigbench:conceptual_combinations:emergent_properties", + suite=["helm"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conceptual_combinations-emergent_properties", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conceptual_combinations_fanciful_fictional_combinations_helm = LightevalTaskConfig( + name="bigbench:conceptual_combinations:fanciful_fictional_combinations", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conceptual_combinations-fanciful_fictional_combinations", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conceptual_combinations_homonyms_helm = LightevalTaskConfig( + name="bigbench:conceptual_combinations:homonyms", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conceptual_combinations-homonyms", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + 
output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conceptual_combinations_invented_words_helm = LightevalTaskConfig( + name="bigbench:conceptual_combinations:invented_words", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conceptual_combinations-invented_words", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conlang_translation_adna_from_helm = LightevalTaskConfig( + name="bigbench:conlang_translation:adna_from", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conlang_translation-adna_from", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["rouge1", "rouge2", "rougeL"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conlang_translation_adna_to_helm = LightevalTaskConfig( + name="bigbench:conlang_translation:adna_to", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conlang_translation-adna_to", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["rouge1", "rouge2", "rougeL"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conlang_translation_atikampe_from_helm = LightevalTaskConfig( + name="bigbench:conlang_translation:atikampe_from", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conlang_translation-atikampe_from", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["rouge1", "rouge2", "rougeL"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conlang_translation_atikampe_to_helm = LightevalTaskConfig( + name="bigbench:conlang_translation:atikampe_to", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conlang_translation-atikampe_to", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["rouge1", "rouge2", "rougeL"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conlang_translation_gornam_from_helm = LightevalTaskConfig( + name="bigbench:conlang_translation:gornam_from", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conlang_translation-gornam_from", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["rouge1", "rouge2", "rougeL"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, 
+ version=0, +) +bigbench_conlang_translation_gornam_to_helm = LightevalTaskConfig( + name="bigbench:conlang_translation:gornam_to", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conlang_translation-gornam_to", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["rouge1", "rouge2", "rougeL"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conlang_translation_holuan_from_helm = LightevalTaskConfig( + name="bigbench:conlang_translation:holuan_from", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conlang_translation-holuan_from", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["rouge1", "rouge2", "rougeL"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conlang_translation_holuan_to_helm = LightevalTaskConfig( + name="bigbench:conlang_translation:holuan_to", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conlang_translation-holuan_to", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["rouge1", "rouge2", "rougeL"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conlang_translation_mkafala_from_helm = LightevalTaskConfig( + name="bigbench:conlang_translation:mkafala_from", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conlang_translation-mkafala_from", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["rouge1", "rouge2", "rougeL"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conlang_translation_mkafala_to_helm = LightevalTaskConfig( + name="bigbench:conlang_translation:mkafala_to", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conlang_translation-mkafala_to", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["rouge1", "rouge2", "rougeL"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conlang_translation_postpositive_english_from_helm = LightevalTaskConfig( + name="bigbench:conlang_translation:postpositive_english_from", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conlang_translation-postpositive_english_from", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["rouge1", "rouge2", "rougeL"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conlang_translation_postpositive_english_to_helm = 
LightevalTaskConfig(
+    name="bigbench:conlang_translation:postpositive_english_to",
+    suite=["helm", "bigbench_scenario"],
+    prompt_function="bigbench_helm",
+    hf_repo="lighteval/bigbench_helm",
+    hf_subset="conlang_translation-postpositive_english_to",
+    hf_avail_splits=["train", "test", "validation"],
+    evaluation_splits=["test"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=100,
+    metric=["rouge1", "rouge2", "rougeL"],
+    stop_sequence=["\n"],
+    output_regex=None,
+    frozen=False,
+    trust_dataset=True,
+    version=0,
+)
+bigbench_conlang_translation_unapuri_from_helm = LightevalTaskConfig(
+    name="bigbench:conlang_translation:unapuri_from",
+    suite=["helm", "bigbench_scenario"],
+    prompt_function="bigbench_helm",
+    hf_repo="lighteval/bigbench_helm",
+    hf_subset="conlang_translation-unapuri_from",
+    hf_avail_splits=["train", "test", "validation"],
+    evaluation_splits=["test"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=100,
+    metric=["rouge1", "rouge2", "rougeL"],
+    stop_sequence=["\n"],
+    output_regex=None,
+    frozen=False,
+    trust_dataset=True,
+    version=0,
+)
+bigbench_conlang_translation_unapuri_to_helm = LightevalTaskConfig(
+    name="bigbench:conlang_translation:unapuri_to",
+    suite=["helm", "bigbench_scenario"],
+    prompt_function="bigbench_helm",
+    hf_repo="lighteval/bigbench_helm",
+    hf_subset="conlang_translation-unapuri_to",
+    hf_avail_splits=["train", "test", "validation"],
+    evaluation_splits=["test"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=100,
+    metric=["rouge1", "rouge2", "rougeL"],
+    stop_sequence=["\n"],
+    output_regex=None,
+    frozen=False,
+    trust_dataset=True,
+    version=0,
+)
+bigbench_conlang_translation_vaomi_from_helm = LightevalTaskConfig(
+    name="bigbench:conlang_translation:vaomi_from",
+    suite=["helm", "bigbench_scenario"],
+    prompt_function="bigbench_helm",
+    hf_repo="lighteval/bigbench_helm",
+    hf_subset="conlang_translation-vaomi_from",
+    hf_avail_splits=["train", "test", "validation"],
+    evaluation_splits=["test"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=100,
+    metric=["rouge1", "rouge2", "rougeL"],
+    stop_sequence=["\n"],
+    output_regex=None,
+    frozen=False,
+    trust_dataset=True,
+    version=0,
+)
+bigbench_conlang_translation_vaomi_to_helm = LightevalTaskConfig(
+    name="bigbench:conlang_translation:vaomi_to",
+    suite=["helm", "bigbench_scenario"],
+    prompt_function="bigbench_helm",
+    hf_repo="lighteval/bigbench_helm",
+    hf_subset="conlang_translation-vaomi_to",
+    hf_avail_splits=["train", "test", "validation"],
+    evaluation_splits=["test"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=100,
+    metric=["rouge1", "rouge2", "rougeL"],
+    stop_sequence=["\n"],
+    output_regex=None,
+    frozen=False,
+    trust_dataset=True,
+    version=0,
+)
+bigbench_emoji_movie_helm = LightevalTaskConfig(
+    name="bigbench:emoji_movie",
+    suite=["helm", "bigbench_scenario"],
+    prompt_function="bigbench_helm",
+    hf_repo="lighteval/bigbench_helm",
+    hf_subset="emoji_movie",
+    hf_avail_splits=["train", "test", "validation"],
+    evaluation_splits=["test"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=100,
+    metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"],
+    stop_sequence=["\n"],
+    output_regex=None,
+    frozen=False,
+    trust_dataset=True,
+    version=0,
+)
+bigbench_formal_fallacies_syllogisms_negation_helm = LightevalTaskConfig(
+    name="bigbench:formal_fallacies_syllogisms_negation",
+    suite=["helm", "bigbench_scenario"],
+    prompt_function="bigbench_helm",
+    hf_repo="lighteval/bigbench_helm",
+    hf_subset="formal_fallacies_syllogisms_negation",
+    hf_avail_splits=["train", "test", "validation"],
+    evaluation_splits=["test"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=100,
+    metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"],
+    stop_sequence=["\n"],
+    output_regex=None,
+    frozen=False,
+    trust_dataset=True,
+    version=0,
+)
+bigbench_hindu_knowledge_helm = LightevalTaskConfig(
+    name="bigbench:hindu_knowledge",
+    suite=["helm", "bigbench_scenario"],
+    prompt_function="bigbench_helm",
+    hf_repo="lighteval/bigbench_helm",
+    hf_subset="hindu_knowledge",
+    hf_avail_splits=["train", "test", "validation"],
+    evaluation_splits=["test"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=100,
+    metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"],
+    stop_sequence=["\n"],
+    output_regex=None,
+    frozen=False,
+    trust_dataset=True,
+    version=0,
+)
+bigbench_known_unknowns_helm = LightevalTaskConfig(
+    name="bigbench:known_unknowns",
+    suite=["helm", "bigbench_scenario"],
+    prompt_function="bigbench_helm",
+    hf_repo="lighteval/bigbench_helm",
+    hf_subset="known_unknowns",
+    hf_avail_splits=["train", "test", "validation"],
+    evaluation_splits=["test"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=100,
+    metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"],
+    stop_sequence=["\n"],
+    output_regex=None,
+    frozen=False,
+    trust_dataset=True,
+    version=0,
+)
+bigbench_language_identification_helm = LightevalTaskConfig(
+    name="bigbench:language_identification",
+    suite=["helm", "bigbench_scenario"],
+    prompt_function="bigbench_helm",
+    hf_repo="lighteval/bigbench_helm",
+    hf_subset="language_identification",
+    hf_avail_splits=["train", "test", "validation"],
+    evaluation_splits=["test"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=100,
+    metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"],
+    stop_sequence=["\n"],
+    output_regex=None,
+    frozen=False,
+    trust_dataset=True,
+    version=0,
+)
+bigbench_linguistics_puzzles_helm = LightevalTaskConfig(
+    name="bigbench:linguistics_puzzles",
+    suite=["helm", "bigbench_scenario"],
+    prompt_function="bigbench_helm",
+    hf_repo="lighteval/bigbench_helm",
+    hf_subset="linguistics_puzzles",
+    hf_avail_splits=["train", "test", "validation"],
+    evaluation_splits=["test"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=100,
+    metric=["exact_match", "quasi_exact_match"],
+    stop_sequence=["\n"],
+    output_regex=None,
+    frozen=False,
+    trust_dataset=True,
+    version=0,
+)
+bigbench_logic_grid_puzzle_helm = LightevalTaskConfig(
+    name="bigbench:logic_grid_puzzle",
+    suite=["helm", "bigbench_scenario"],
+    prompt_function="bigbench_helm",
+    hf_repo="lighteval/bigbench_helm",
+    hf_subset="logic_grid_puzzle",
+    hf_avail_splits=["train", "test", "validation"],
+    evaluation_splits=["test"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=100,
+    metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"],
+    stop_sequence=["\n"],
+    output_regex=None,
+    frozen=False,
+    trust_dataset=True,
+    version=0,
+)
+bigbench_logical_deduction_five_objects_helm = LightevalTaskConfig(
+    name="bigbench:logical_deduction-five_objects",
+    suite=["helm", "bigbench_scenario"],
+    prompt_function="bigbench_helm",
+    hf_repo="lighteval/bigbench_helm",
+    hf_subset="logical_deduction-five_objects",
+    hf_avail_splits=["train", "test", "validation"],
+    evaluation_splits=["test"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=100,
+    metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"],
+    stop_sequence=["\n"],
+    output_regex=None,
+    frozen=False,
+    trust_dataset=True,
+    version=0,
+)
+bigbench_logical_deduction_seven_objects_helm = LightevalTaskConfig(
+    name="bigbench:logical_deduction-seven_objects",
+    suite=["helm", "bigbench_scenario"],
+    prompt_function="bigbench_helm",
+    hf_repo="lighteval/bigbench_helm",
+    hf_subset="logical_deduction-seven_objects",
+    hf_avail_splits=["train", "test", "validation"],
+    evaluation_splits=["test"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=100,
+    metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"],
+    stop_sequence=["\n"],
+    output_regex=None,
+    frozen=False,
+    trust_dataset=True,
+    version=0,
+)
+bigbench_logical_deduction_three_objects_helm = LightevalTaskConfig(
+    name="bigbench:logical_deduction-three_objects",
+    suite=["helm", "bigbench_scenario"],
+    prompt_function="bigbench_helm",
+    hf_repo="lighteval/bigbench_helm",
+    hf_subset="logical_deduction-three_objects",
+    hf_avail_splits=["train", "test", "validation"],
+    evaluation_splits=["test"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=100,
+    metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"],
+    stop_sequence=["\n"],
+    output_regex=None,
+    frozen=False,
+    trust_dataset=True,
+    version=0,
+)
+bigbench_misconceptions_russian_helm = LightevalTaskConfig(
+    name="bigbench:misconceptions_russian",
+    suite=["helm", "bigbench_scenario"],
+    prompt_function="bigbench_helm",
+    hf_repo="lighteval/bigbench_helm",
+    hf_subset="misconceptions_russian",
+    hf_avail_splits=["train", "test", "validation"],
+    evaluation_splits=["test"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=100,
+    metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"],
+    stop_sequence=["\n"],
+    output_regex=None,
+    frozen=False,
+    trust_dataset=True,
+    version=0,
+)
+bigbench_novel_concepts_helm = LightevalTaskConfig(
+    name="bigbench:novel_concepts",
+    suite=["helm", "bigbench_scenario"],
+    prompt_function="bigbench_helm",
+    hf_repo="lighteval/bigbench_helm",
+    hf_subset="novel_concepts",
+    hf_avail_splits=["train", "test", "validation"],
+    evaluation_splits=["test"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=100,
+    metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"],
+    stop_sequence=["\n"],
+    output_regex=None,
+    frozen=False,
+    trust_dataset=True,
+    version=0,
+)
+bigbench_operators_helm = LightevalTaskConfig(
+    name="bigbench:operators",
+    suite=["helm", "bigbench_scenario"],
+    prompt_function="bigbench_helm",
+    hf_repo="lighteval/bigbench_helm",
+    hf_subset="operators",
+    hf_avail_splits=["train", "test", "validation"],
+    evaluation_splits=["test"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=100,
+    metric=["exact_match", "quasi_exact_match"],
+    stop_sequence=["\n"],
+    output_regex=None,
+    frozen=False,
+    trust_dataset=True,
+    version=0,
+)
+bigbench_parsinlu_reading_comprehension_helm = LightevalTaskConfig(
+    name="bigbench:parsinlu_reading_comprehension",
+    suite=["helm",
"bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="parsinlu_reading_comprehension", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_play_dialog_same_or_different_helm = LightevalTaskConfig( + name="bigbench:play_dialog_same_or_different", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="play_dialog_same_or_different", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_repeat_copy_logic_helm = LightevalTaskConfig( + name="bigbench:repeat_copy_logic", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="repeat_copy_logic", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_strange_stories_boolean_helm = LightevalTaskConfig( + name="bigbench:strange_stories-boolean", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="strange_stories-boolean", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_strange_stories_multiple_choice_helm = LightevalTaskConfig( + name="bigbench:strange_stories-multiple_choice", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="strange_stories-multiple_choice", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_strategyqa_helm = LightevalTaskConfig( + name="bigbench:strategyqa", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="strategyqa", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_symbol_interpretation_adversarial_helm = LightevalTaskConfig( + name="bigbench:symbol_interpretation-adversarial", + suite=["helm", 
"bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="symbol_interpretation-adversarial", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_symbol_interpretation_emoji_agnostic_helm = LightevalTaskConfig( + name="bigbench:symbol_interpretation-emoji_agnostic", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="symbol_interpretation-emoji_agnostic", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_symbol_interpretation_name_agnostic_helm = LightevalTaskConfig( + name="bigbench:symbol_interpretation-name_agnostic", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="symbol_interpretation-name_agnostic", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_symbol_interpretation_plain_helm = LightevalTaskConfig( + name="bigbench:symbol_interpretation-plain", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="symbol_interpretation-plain", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_symbol_interpretation_tricky_helm = LightevalTaskConfig( + name="bigbench:symbol_interpretation-tricky", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="symbol_interpretation-tricky", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_vitaminc_fact_verification_helm = LightevalTaskConfig( + name="bigbench:vitaminc_fact_verification", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="vitaminc_fact_verification", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + 
output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_winowhy_helm = LightevalTaskConfig( + name="bigbench:winowhy", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="winowhy", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_adjunct_island_lighteval = LightevalTaskConfig( + name="blimp:adjunct_island", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="adjunct_island", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_adjunct_island_helm = LightevalTaskConfig( + name="blimp:adjunct_island", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="adjunct_island", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_anaphor_gender_agreement_lighteval = LightevalTaskConfig( + name="blimp:anaphor_gender_agreement", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="anaphor_gender_agreement", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_anaphor_gender_agreement_helm = LightevalTaskConfig( + name="blimp:anaphor_gender_agreement", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="anaphor_gender_agreement", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_anaphor_number_agreement_lighteval = LightevalTaskConfig( + name="blimp:anaphor_number_agreement", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="anaphor_number_agreement", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_anaphor_number_agreement_helm = LightevalTaskConfig( + name="blimp:anaphor_number_agreement", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="anaphor_number_agreement", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) 
+blimp_animate_subject_passive_lighteval = LightevalTaskConfig( + name="blimp:animate_subject_passive", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="animate_subject_passive", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_animate_subject_passive_helm = LightevalTaskConfig( + name="blimp:animate_subject_passive", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="animate_subject_passive", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_animate_subject_trans_lighteval = LightevalTaskConfig( + name="blimp:animate_subject_trans", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="animate_subject_trans", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_animate_subject_trans_helm = LightevalTaskConfig( + name="blimp:animate_subject_trans", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="animate_subject_trans", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_causative_lighteval = LightevalTaskConfig( + name="blimp:causative", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="causative", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_causative_helm = LightevalTaskConfig( + name="blimp:causative", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="causative", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_complex_NP_island_lighteval = LightevalTaskConfig( + name="blimp:complex_NP_island", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="complex_NP_island", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_complex_NP_island_helm = LightevalTaskConfig( + name="blimp:complex_NP_island", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="complex_NP_island", + hf_avail_splits=["train"], + 
evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_coordinate_structure_constraint_complex_left_branch_lighteval = LightevalTaskConfig( + name="blimp:coordinate_structure_constraint_complex_left_branch", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="coordinate_structure_constraint_complex_left_branch", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_coordinate_structure_constraint_complex_left_branch_helm = LightevalTaskConfig( + name="blimp:coordinate_structure_constraint_complex_left_branch", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="coordinate_structure_constraint_complex_left_branch", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_coordinate_structure_constraint_object_extraction_lighteval = LightevalTaskConfig( + name="blimp:coordinate_structure_constraint_object_extraction", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="coordinate_structure_constraint_object_extraction", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_coordinate_structure_constraint_object_extraction_helm = LightevalTaskConfig( + name="blimp:coordinate_structure_constraint_object_extraction", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="coordinate_structure_constraint_object_extraction", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_1_lighteval = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_1", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_1_helm = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_1", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + 
version=0, +) +blimp_determiner_noun_agreement_2_lighteval = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_2", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_2_helm = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_2", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_irregular_1_lighteval = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_irregular_1", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_irregular_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_irregular_1_helm = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_irregular_1", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_irregular_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_irregular_2_lighteval = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_irregular_2", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_irregular_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_irregular_2_helm = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_irregular_2", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_irregular_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_with_adj_2_lighteval = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_with_adj_2", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_with_adj_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + 
metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_with_adj_2_helm = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_with_adj_2", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_with_adj_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_with_adj_irregular_1_lighteval = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_with_adj_irregular_1", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_with_adj_irregular_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_with_adj_irregular_1_helm = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_with_adj_irregular_1", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_with_adj_irregular_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_with_adj_irregular_2_lighteval = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_with_adj_irregular_2", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_with_adj_irregular_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_with_adj_irregular_2_helm = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_with_adj_irregular_2", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_with_adj_irregular_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_with_adjective_1_lighteval = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_with_adjective_1", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_with_adjective_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_with_adjective_1_helm = LightevalTaskConfig( + 
name="blimp:determiner_noun_agreement_with_adjective_1", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_with_adjective_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_distractor_agreement_relational_noun_lighteval = LightevalTaskConfig( + name="blimp:distractor_agreement_relational_noun", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="distractor_agreement_relational_noun", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_distractor_agreement_relational_noun_helm = LightevalTaskConfig( + name="blimp:distractor_agreement_relational_noun", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="distractor_agreement_relational_noun", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_distractor_agreement_relative_clause_lighteval = LightevalTaskConfig( + name="blimp:distractor_agreement_relative_clause", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="distractor_agreement_relative_clause", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_distractor_agreement_relative_clause_helm = LightevalTaskConfig( + name="blimp:distractor_agreement_relative_clause", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="distractor_agreement_relative_clause", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_drop_argument_lighteval = LightevalTaskConfig( + name="blimp:drop_argument", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="drop_argument", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_drop_argument_helm = LightevalTaskConfig( + name="blimp:drop_argument", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="drop_argument", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) 
+blimp_ellipsis_n_bar_1_lighteval = LightevalTaskConfig( + name="blimp:ellipsis_n_bar_1", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="ellipsis_n_bar_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_ellipsis_n_bar_1_helm = LightevalTaskConfig( + name="blimp:ellipsis_n_bar_1", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="ellipsis_n_bar_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_ellipsis_n_bar_2_lighteval = LightevalTaskConfig( + name="blimp:ellipsis_n_bar_2", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="ellipsis_n_bar_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_ellipsis_n_bar_2_helm = LightevalTaskConfig( + name="blimp:ellipsis_n_bar_2", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="ellipsis_n_bar_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_existential_there_object_raising_lighteval = LightevalTaskConfig( + name="blimp:existential_there_object_raising", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="existential_there_object_raising", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_existential_there_object_raising_helm = LightevalTaskConfig( + name="blimp:existential_there_object_raising", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="existential_there_object_raising", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_existential_there_quantifiers_1_lighteval = LightevalTaskConfig( + name="blimp:existential_there_quantifiers_1", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="existential_there_quantifiers_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_existential_there_quantifiers_1_helm = LightevalTaskConfig( + name="blimp:existential_there_quantifiers_1", + 
suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="existential_there_quantifiers_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_existential_there_quantifiers_2_lighteval = LightevalTaskConfig( + name="blimp:existential_there_quantifiers_2", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="existential_there_quantifiers_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_existential_there_quantifiers_2_helm = LightevalTaskConfig( + name="blimp:existential_there_quantifiers_2", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="existential_there_quantifiers_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_existential_there_subject_raising_lighteval = LightevalTaskConfig( + name="blimp:existential_there_subject_raising", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="existential_there_subject_raising", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_existential_there_subject_raising_helm = LightevalTaskConfig( + name="blimp:existential_there_subject_raising", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="existential_there_subject_raising", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_expletive_it_object_raising_lighteval = LightevalTaskConfig( + name="blimp:expletive_it_object_raising", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="expletive_it_object_raising", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_expletive_it_object_raising_helm = LightevalTaskConfig( + name="blimp:expletive_it_object_raising", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="expletive_it_object_raising", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_inchoative_lighteval = LightevalTaskConfig( + 
name="blimp:inchoative", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="inchoative", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_inchoative_helm = LightevalTaskConfig( + name="blimp:inchoative", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="inchoative", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_intransitive_lighteval = LightevalTaskConfig( + name="blimp:intransitive", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="intransitive", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_intransitive_helm = LightevalTaskConfig( + name="blimp:intransitive", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="intransitive", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_irregular_past_participle_adjectives_lighteval = LightevalTaskConfig( + name="blimp:irregular_past_participle_adjectives", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="irregular_past_participle_adjectives", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_irregular_past_participle_adjectives_helm = LightevalTaskConfig( + name="blimp:irregular_past_participle_adjectives", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="irregular_past_participle_adjectives", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_irregular_past_participle_verbs_lighteval = LightevalTaskConfig( + name="blimp:irregular_past_participle_verbs", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="irregular_past_participle_verbs", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_irregular_past_participle_verbs_helm = LightevalTaskConfig( + name="blimp:irregular_past_participle_verbs", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + 
hf_subset="irregular_past_participle_verbs", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_irregular_plural_subject_verb_agreement_1_lighteval = LightevalTaskConfig( + name="blimp:irregular_plural_subject_verb_agreement_1", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="irregular_plural_subject_verb_agreement_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_irregular_plural_subject_verb_agreement_1_helm = LightevalTaskConfig( + name="blimp:irregular_plural_subject_verb_agreement_1", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="irregular_plural_subject_verb_agreement_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_irregular_plural_subject_verb_agreement_2_lighteval = LightevalTaskConfig( + name="blimp:irregular_plural_subject_verb_agreement_2", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="irregular_plural_subject_verb_agreement_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_irregular_plural_subject_verb_agreement_2_helm = LightevalTaskConfig( + name="blimp:irregular_plural_subject_verb_agreement_2", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="irregular_plural_subject_verb_agreement_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_left_branch_island_echo_question_lighteval = LightevalTaskConfig( + name="blimp:left_branch_island_echo_question", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="left_branch_island_echo_question", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_left_branch_island_echo_question_helm = LightevalTaskConfig( + name="blimp:left_branch_island_echo_question", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="left_branch_island_echo_question", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, 
+) +blimp_left_branch_island_simple_question_lighteval = LightevalTaskConfig( + name="blimp:left_branch_island_simple_question", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="left_branch_island_simple_question", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_left_branch_island_simple_question_helm = LightevalTaskConfig( + name="blimp:left_branch_island_simple_question", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="left_branch_island_simple_question", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_matrix_question_npi_licensor_present_lighteval = LightevalTaskConfig( + name="blimp:matrix_question_npi_licensor_present", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="matrix_question_npi_licensor_present", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_matrix_question_npi_licensor_present_helm = LightevalTaskConfig( + name="blimp:matrix_question_npi_licensor_present", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="matrix_question_npi_licensor_present", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_npi_present_1_lighteval = LightevalTaskConfig( + name="blimp:npi_present_1", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="npi_present_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_npi_present_1_helm = LightevalTaskConfig( + name="blimp:npi_present_1", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="npi_present_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_npi_present_2_lighteval = LightevalTaskConfig( + name="blimp:npi_present_2", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="npi_present_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_npi_present_2_helm = LightevalTaskConfig( + 
name="blimp:npi_present_2", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="npi_present_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_only_npi_licensor_present_lighteval = LightevalTaskConfig( + name="blimp:only_npi_licensor_present", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="only_npi_licensor_present", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_only_npi_licensor_present_helm = LightevalTaskConfig( + name="blimp:only_npi_licensor_present", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="only_npi_licensor_present", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_only_npi_scope_lighteval = LightevalTaskConfig( + name="blimp:only_npi_scope", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="only_npi_scope", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_only_npi_scope_helm = LightevalTaskConfig( + name="blimp:only_npi_scope", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="only_npi_scope", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_passive_1_lighteval = LightevalTaskConfig( + name="blimp:passive_1", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="passive_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_passive_1_helm = LightevalTaskConfig( + name="blimp:passive_1", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="passive_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_passive_2_lighteval = LightevalTaskConfig( + name="blimp:passive_2", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="passive_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + 
generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_passive_2_helm = LightevalTaskConfig( + name="blimp:passive_2", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="passive_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_principle_A_c_command_lighteval = LightevalTaskConfig( + name="blimp:principle_A_c_command", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="principle_A_c_command", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_principle_A_c_command_helm = LightevalTaskConfig( + name="blimp:principle_A_c_command", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="principle_A_c_command", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_principle_A_case_1_lighteval = LightevalTaskConfig( + name="blimp:principle_A_case_1", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="principle_A_case_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_principle_A_case_1_helm = LightevalTaskConfig( + name="blimp:principle_A_case_1", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="principle_A_case_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_principle_A_case_2_lighteval = LightevalTaskConfig( + name="blimp:principle_A_case_2", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="principle_A_case_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_principle_A_case_2_helm = LightevalTaskConfig( + name="blimp:principle_A_case_2", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="principle_A_case_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_principle_A_domain_1_lighteval = LightevalTaskConfig( 
+ name="blimp:principle_A_domain_1", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="principle_A_domain_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_principle_A_domain_1_helm = LightevalTaskConfig( + name="blimp:principle_A_domain_1", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="principle_A_domain_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_principle_A_domain_2_lighteval = LightevalTaskConfig( + name="blimp:principle_A_domain_2", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="principle_A_domain_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_principle_A_domain_2_helm = LightevalTaskConfig( + name="blimp:principle_A_domain_2", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="principle_A_domain_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_principle_A_domain_3_lighteval = LightevalTaskConfig( + name="blimp:principle_A_domain_3", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="principle_A_domain_3", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_principle_A_domain_3_helm = LightevalTaskConfig( + name="blimp:principle_A_domain_3", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="principle_A_domain_3", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_principle_A_reconstruction_lighteval = LightevalTaskConfig( + name="blimp:principle_A_reconstruction", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="principle_A_reconstruction", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_principle_A_reconstruction_helm = LightevalTaskConfig( + name="blimp:principle_A_reconstruction", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + 
hf_subset="principle_A_reconstruction", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_regular_plural_subject_verb_agreement_1_lighteval = LightevalTaskConfig( + name="blimp:regular_plural_subject_verb_agreement_1", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="regular_plural_subject_verb_agreement_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_regular_plural_subject_verb_agreement_1_helm = LightevalTaskConfig( + name="blimp:regular_plural_subject_verb_agreement_1", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="regular_plural_subject_verb_agreement_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_regular_plural_subject_verb_agreement_2_lighteval = LightevalTaskConfig( + name="blimp:regular_plural_subject_verb_agreement_2", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="regular_plural_subject_verb_agreement_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_regular_plural_subject_verb_agreement_2_helm = LightevalTaskConfig( + name="blimp:regular_plural_subject_verb_agreement_2", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="regular_plural_subject_verb_agreement_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_sentential_negation_npi_licensor_present_lighteval = LightevalTaskConfig( + name="blimp:sentential_negation_npi_licensor_present", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="sentential_negation_npi_licensor_present", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_sentential_negation_npi_licensor_present_helm = LightevalTaskConfig( + name="blimp:sentential_negation_npi_licensor_present", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="sentential_negation_npi_licensor_present", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + 
+    trust_dataset=True,
+    version=0,
+)
+blimp_sentential_negation_npi_scope_lighteval = LightevalTaskConfig(
+    name="blimp:sentential_negation_npi_scope",
+    suite=["lighteval", "blimp"],
+    prompt_function="blimp",
+    hf_repo="blimp",
+    hf_subset="sentential_negation_npi_scope",
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=1,
+    metric=["loglikelihood_acc"],
+    stop_sequence=["\n"],
+    output_regex=None,
+    frozen=False,
+    trust_dataset=True,
+    version=0,
+)
+blimp_sentential_negation_npi_scope_helm = LightevalTaskConfig(
+    name="blimp:sentential_negation_npi_scope",
+    suite=["helm", "blimp"],
+    prompt_function="blimp_helm",
+    hf_repo="blimp",
+    hf_subset="sentential_negation_npi_scope",
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=1,
+    metric=["loglikelihood_acc", "loglikelihood_acc_norm"],
+    stop_sequence=["\n"],
+    output_regex=None,
+    frozen=False,
+    trust_dataset=True,
+    version=0,
+)
+blimp_sentential_subject_island_lighteval = LightevalTaskConfig(
+    name="blimp:sentential_subject_island",
+    suite=["lighteval", "blimp"],
+    prompt_function="blimp",
+    hf_repo="blimp",
+    hf_subset="sentential_subject_island",
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=1,
+    metric=["loglikelihood_acc"],
+    stop_sequence=["\n"],
+    output_regex=None,
+    frozen=False,
+    trust_dataset=True,
+    version=0,
+)
+blimp_sentential_subject_island_helm = LightevalTaskConfig(
+    name="blimp:sentential_subject_island",
+    suite=["helm", "blimp"],
+    prompt_function="blimp_helm",
+    hf_repo="blimp",
+    hf_subset="sentential_subject_island",
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=1,
+    metric=["loglikelihood_acc", "loglikelihood_acc_norm"],
+    stop_sequence=["\n"],
+    output_regex=None,
+    frozen=False,
+    trust_dataset=True,
+    version=0,
+)
+blimp_superlative_quantifiers_1_lighteval = LightevalTaskConfig(
+    name="blimp:superlative_quantifiers_1",
+    suite=["lighteval", "blimp"],
+    prompt_function="blimp",
+    hf_repo="blimp",
+    hf_subset="superlative_quantifiers_1",
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=1,
+    metric=["loglikelihood_acc"],
+    stop_sequence=["\n"],
+    output_regex=None,
+    frozen=False,
+    trust_dataset=True,
+    version=0,
+)
+blimp_superlative_quantifiers_1_helm = LightevalTaskConfig(
+    name="blimp:superlative_quantifiers_1",
+    suite=["helm", "blimp"],
+    prompt_function="blimp_helm",
+    hf_repo="blimp",
+    hf_subset="superlative_quantifiers_1",
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=1,
+    metric=["loglikelihood_acc", "loglikelihood_acc_norm"],
+    stop_sequence=["\n"],
+    output_regex=None,
+    frozen=False,
+    trust_dataset=True,
+    version=0,
+)
+blimp_superlative_quantifiers_2_lighteval = LightevalTaskConfig(
+    name="blimp:superlative_quantifiers_2",
+    suite=["lighteval", "blimp"],
+    prompt_function="blimp",
+    hf_repo="blimp",
+    hf_subset="superlative_quantifiers_2",
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=1,
+    metric=["loglikelihood_acc"],
+    stop_sequence=["\n"],
+    output_regex=None,
+    frozen=False,
+    trust_dataset=True,
+    version=0,
+)
+blimp_superlative_quantifiers_2_helm = LightevalTaskConfig( + name="blimp:superlative_quantifiers_2", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="superlative_quantifiers_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_tough_vs_raising_1_lighteval = LightevalTaskConfig( + name="blimp:tough_vs_raising_1", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="tough_vs_raising_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_tough_vs_raising_1_helm = LightevalTaskConfig( + name="blimp:tough_vs_raising_1", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="tough_vs_raising_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_tough_vs_raising_2_lighteval = LightevalTaskConfig( + name="blimp:tough_vs_raising_2", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="tough_vs_raising_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_tough_vs_raising_2_helm = LightevalTaskConfig( + name="blimp:tough_vs_raising_2", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="tough_vs_raising_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_transitive_lighteval = LightevalTaskConfig( + name="blimp:transitive", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="transitive", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_transitive_helm = LightevalTaskConfig( + name="blimp:transitive", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="transitive", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_island_lighteval = LightevalTaskConfig( + name="blimp:wh_island", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="wh_island", + hf_avail_splits=["train"], + 
evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_island_helm = LightevalTaskConfig( + name="blimp:wh_island", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="wh_island", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_questions_object_gap_lighteval = LightevalTaskConfig( + name="blimp:wh_questions_object_gap", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="wh_questions_object_gap", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_questions_object_gap_helm = LightevalTaskConfig( + name="blimp:wh_questions_object_gap", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="wh_questions_object_gap", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_questions_subject_gap_lighteval = LightevalTaskConfig( + name="blimp:wh_questions_subject_gap", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="wh_questions_subject_gap", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_questions_subject_gap_helm = LightevalTaskConfig( + name="blimp:wh_questions_subject_gap", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="wh_questions_subject_gap", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_questions_subject_gap_long_distance_lighteval = LightevalTaskConfig( + name="blimp:wh_questions_subject_gap_long_distance", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="wh_questions_subject_gap_long_distance", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_questions_subject_gap_long_distance_helm = LightevalTaskConfig( + name="blimp:wh_questions_subject_gap_long_distance", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="wh_questions_subject_gap_long_distance", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + 
few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_vs_that_no_gap_lighteval = LightevalTaskConfig( + name="blimp:wh_vs_that_no_gap", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="wh_vs_that_no_gap", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_vs_that_no_gap_helm = LightevalTaskConfig( + name="blimp:wh_vs_that_no_gap", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="wh_vs_that_no_gap", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_vs_that_no_gap_long_distance_lighteval = LightevalTaskConfig( + name="blimp:wh_vs_that_no_gap_long_distance", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="wh_vs_that_no_gap_long_distance", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_vs_that_no_gap_long_distance_helm = LightevalTaskConfig( + name="blimp:wh_vs_that_no_gap_long_distance", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="wh_vs_that_no_gap_long_distance", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_vs_that_with_gap_lighteval = LightevalTaskConfig( + name="blimp:wh_vs_that_with_gap", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="wh_vs_that_with_gap", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_vs_that_with_gap_helm = LightevalTaskConfig( + name="blimp:wh_vs_that_with_gap", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="wh_vs_that_with_gap", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_vs_that_with_gap_long_distance_lighteval = LightevalTaskConfig( + name="blimp:wh_vs_that_with_gap_long_distance", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="wh_vs_that_with_gap_long_distance", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + 
metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_vs_that_with_gap_long_distance_helm = LightevalTaskConfig( + name="blimp:wh_vs_that_with_gap_long_distance", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="wh_vs_that_with_gap_long_distance", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bold_helm = LightevalTaskConfig( + name="bold", + suite=["helm"], + prompt_function="bold", + hf_repo="lighteval/bold_helm", + hf_subset="all", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["prediction_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bold_gender_helm = LightevalTaskConfig( + name="bold:gender", + suite=["helm"], + prompt_function="bold", + hf_repo="lighteval/bold_helm", + hf_subset="gender", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["prediction_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bold_political_ideology_helm = LightevalTaskConfig( + name="bold:political_ideology", + suite=["helm"], + prompt_function="bold", + hf_repo="lighteval/bold_helm", + hf_subset="political_ideology", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["prediction_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bold_profession_helm = LightevalTaskConfig( + name="bold:profession", + suite=["helm"], + prompt_function="bold", + hf_repo="lighteval/bold_helm", + hf_subset="profession", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["prediction_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bold_race_helm = LightevalTaskConfig( + name="bold:race", + suite=["helm"], + prompt_function="bold", + hf_repo="lighteval/bold_helm", + hf_subset="race", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["prediction_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bold_religious_ideology_helm = LightevalTaskConfig( + name="bold:religious_ideology", + suite=["helm"], + prompt_function="bold", + hf_repo="lighteval/bold_helm", + hf_subset="religious_ideology", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["prediction_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +boolq_helm = LightevalTaskConfig( + name="boolq", + suite=["helm", "helm_general"], + prompt_function="boolq_helm", + hf_repo="lighteval/boolq_helm", + hf_subset="default", + hf_avail_splits=["train", 
"validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +boolq_contrastset_helm = LightevalTaskConfig( + name="boolq:contrastset", + suite=["helm"], + prompt_function="boolq_helm_contrastset", + hf_repo="lighteval/boolq_helm", + hf_subset="default", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bridging_anaphora_resolution_barqa_bigbench = LightevalTaskConfig( + name="bridging_anaphora_resolution_barqa", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="bridging_anaphora_resolution_barqa", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +causal_judgment_bigbench = LightevalTaskConfig( + name="causal_judgment", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="causal_judgment", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +cause_and_effect_bigbench = LightevalTaskConfig( + name="cause_and_effect", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="cause_and_effect", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +checkmate_in_one_bigbench = LightevalTaskConfig( + name="checkmate_in_one", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="checkmate_in_one", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +chess_state_tracking_bigbench = LightevalTaskConfig( + name="chess_state_tracking", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="chess_state_tracking", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +chinese_remainder_theorem_bigbench = LightevalTaskConfig( + name="chinese_remainder_theorem", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + 
hf_repo="bigbench", + hf_subset="chinese_remainder_theorem", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +cifar10_classification_bigbench = LightevalTaskConfig( + name="cifar10_classification", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="cifar10_classification", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +civil_comments_helm = LightevalTaskConfig( + name="civil_comments", + suite=["helm", "helm_general"], + prompt_function="civil_comments", + hf_repo="lighteval/civil_comments_helm", + hf_subset="all", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +civil_comments_LGBTQ_helm = LightevalTaskConfig( + name="civil_comments:LGBTQ", + suite=["helm"], + prompt_function="civil_comments", + hf_repo="lighteval/civil_comments_helm", + hf_subset="LGBTQ", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +civil_comments_black_helm = LightevalTaskConfig( + name="civil_comments:black", + suite=["helm"], + prompt_function="civil_comments", + hf_repo="lighteval/civil_comments_helm", + hf_subset="black", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +civil_comments_christian_helm = LightevalTaskConfig( + name="civil_comments:christian", + suite=["helm"], + prompt_function="civil_comments", + hf_repo="lighteval/civil_comments_helm", + hf_subset="christian", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +civil_comments_female_helm = LightevalTaskConfig( + name="civil_comments:female", + suite=["helm"], + prompt_function="civil_comments", + hf_repo="lighteval/civil_comments_helm", + hf_subset="female", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + 
generation_size=100, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +civil_comments_male_helm = LightevalTaskConfig( + name="civil_comments:male", + suite=["helm"], + prompt_function="civil_comments", + hf_repo="lighteval/civil_comments_helm", + hf_subset="male", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +civil_comments_muslim_helm = LightevalTaskConfig( + name="civil_comments:muslim", + suite=["helm"], + prompt_function="civil_comments", + hf_repo="lighteval/civil_comments_helm", + hf_subset="muslim", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +civil_comments_other_religions_helm = LightevalTaskConfig( + name="civil_comments:other_religions", + suite=["helm"], + prompt_function="civil_comments", + hf_repo="lighteval/civil_comments_helm", + hf_subset="other_religions", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +civil_comments_white_helm = LightevalTaskConfig( + name="civil_comments:white", + suite=["helm"], + prompt_function="civil_comments", + hf_repo="lighteval/civil_comments_helm", + hf_subset="white", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +code_line_description_bigbench_lite = LightevalTaskConfig( + name="code_line_description", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_linefeed_before_and_after_query", + hf_repo="bigbench", + hf_subset="code_line_description", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +codenames_bigbench = LightevalTaskConfig( + name="codenames", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="codenames", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + 
metric=["rouge_t5", "bleu", "bleu"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +color_bigbench = LightevalTaskConfig( + name="color", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="color", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["rouge_t5", "bleu", "loglikelihood_acc", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +common_morpheme_bigbench = LightevalTaskConfig( + name="common_morpheme", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="common_morpheme", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +commonsenseqa_helm = LightevalTaskConfig( + name="commonsenseqa", + suite=["helm", "commonsense_scenario"], + prompt_function="commonsense_qa", + hf_repo="commonsense_qa", + hf_subset="default", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +conceptual_combinations_bigbench_lite = LightevalTaskConfig( + name="conceptual_combinations", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_linefeed_before_whitespace_after_query", + hf_repo="bigbench", + hf_subset="conceptual_combinations", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +conlang_translation_bigbench_lite = LightevalTaskConfig( + name="conlang_translation", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_whitespace_after_query", + hf_repo="bigbench", + hf_subset="conlang_translation", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["rouge_t5", "bleu", "perfect_exact_match"], + stop_sequence=[".", ";", "!", "?"], + output_regex="[^\\.\\?\\!\\;\\n]+", + trust_dataset=True, + version=0, +) +contextual_parametric_knowledge_conflicts_bigbench = LightevalTaskConfig( + name="contextual_parametric_knowledge_conflicts", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="contextual_parametric_knowledge_conflicts", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["rouge_t5", "loglikelihood_acc", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_n_books_1000_extractions_per_book_1_prefix_length_125_helm = LightevalTaskConfig( + 
name="copyright:n_books_1000-extractions_per_book_1-prefix_length_125", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="n_books_1000-extractions_per_book_1-prefix_length_125", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_n_books_1000_extractions_per_book_1_prefix_length_25_helm = LightevalTaskConfig( + name="copyright:n_books_1000-extractions_per_book_1-prefix_length_25", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="n_books_1000-extractions_per_book_1-prefix_length_25", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_n_books_1000_extractions_per_book_1_prefix_length_5_helm = LightevalTaskConfig( + name="copyright:n_books_1000-extractions_per_book_1-prefix_length_5", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="n_books_1000-extractions_per_book_1-prefix_length_5", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_n_books_1000_extractions_per_book_3_prefix_length_125_helm = LightevalTaskConfig( + name="copyright:n_books_1000-extractions_per_book_3-prefix_length_125", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="n_books_1000-extractions_per_book_3-prefix_length_125", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_n_books_1000_extractions_per_book_3_prefix_length_25_helm = LightevalTaskConfig( + name="copyright:n_books_1000-extractions_per_book_3-prefix_length_25", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="n_books_1000-extractions_per_book_3-prefix_length_25", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_n_books_1000_extractions_per_book_3_prefix_length_5_helm = LightevalTaskConfig( + name="copyright:n_books_1000-extractions_per_book_3-prefix_length_5", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="n_books_1000-extractions_per_book_3-prefix_length_5", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_oh_the_places_helm = LightevalTaskConfig( + 
name="copyright:oh_the_places", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="oh_the_places", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_pilot_helm = LightevalTaskConfig( + name="copyright:pilot", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="pilot", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_popular_books_prefix_length_10_helm = LightevalTaskConfig( + name="copyright:popular_books-prefix_length_10", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="popular_books-prefix_length_10", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_popular_books_prefix_length_125_helm = LightevalTaskConfig( + name="copyright:popular_books-prefix_length_125", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="popular_books-prefix_length_125", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_popular_books_prefix_length_25_helm = LightevalTaskConfig( + name="copyright:popular_books-prefix_length_25", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="popular_books-prefix_length_25", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_popular_books_prefix_length_250_helm = LightevalTaskConfig( + name="copyright:popular_books-prefix_length_250", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="popular_books-prefix_length_250", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_popular_books_prefix_length_5_helm = LightevalTaskConfig( + name="copyright:popular_books-prefix_length_5", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="popular_books-prefix_length_5", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) 
+copyright_popular_books_prefix_length_50_helm = LightevalTaskConfig( + name="copyright:popular_books-prefix_length_50", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="popular_books-prefix_length_50", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_prompt_num_line_1_min_lines_20_helm = LightevalTaskConfig( + name="copyright:prompt_num_line_1-min_lines_20", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="prompt_num_line_1-min_lines_20", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_prompt_num_line_10_min_lines_20_helm = LightevalTaskConfig( + name="copyright:prompt_num_line_10-min_lines_20", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="prompt_num_line_10-min_lines_20", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_prompt_num_line_5_min_lines_20_helm = LightevalTaskConfig( + name="copyright:prompt_num_line_5-min_lines_20", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="prompt_num_line_5-min_lines_20", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +coqa_lighteval = LightevalTaskConfig( + name="coqa", + suite=["lighteval"], + prompt_function="coqa", + hf_repo="coqa", + hf_subset="default", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["perfect_exact_match", "f1_score"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +coqa_bb_lighteval = LightevalTaskConfig( + name="coqa_bb", + suite=["lighteval", "bigbench_programmatic", "bigbench"], + prompt_function="coqa", + hf_repo="coqa", + hf_subset="default", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["perfect_exact_match", "f1_score"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +covid_dialogue_helm = LightevalTaskConfig( + name="covid_dialogue", + suite=["helm"], + prompt_function="covid_dialogue", + hf_repo="lighteval/covid_dialogue", + hf_subset="default", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=["exact_match", "quasi_exact_match", "f1_score", "rougeL", "bleu_1", "bleu_4"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + 
trust_dataset=True, + version=0, +) +crash_blossom_bigbench = LightevalTaskConfig( + name="crash_blossom", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="crash_blossom", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +crass_ai_bigbench = LightevalTaskConfig( + name="crass_ai", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="crass_ai", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +cryobiology_spanish_bigbench = LightevalTaskConfig( + name="cryobiology_spanish", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="cryobiology_spanish", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +cryptonite_bigbench = LightevalTaskConfig( + name="cryptonite", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="cryptonite", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +cs_algorithms_bigbench = LightevalTaskConfig( + name="cs_algorithms", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="cs_algorithms", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +dark_humor_detection_bigbench = LightevalTaskConfig( + name="dark_humor_detection", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="dark_humor_detection", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +date_understanding_bigbench = LightevalTaskConfig( + name="date_understanding", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="date_understanding", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +disambiguation_qa_bigbench = LightevalTaskConfig( + name="disambiguation_qa", + suite=["bigbench", "bigbench_json"], + 
prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="disambiguation_qa", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +discourse_marker_prediction_bigbench = LightevalTaskConfig( + name="discourse_marker_prediction", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="discourse_marker_prediction", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +disfl_qa_bigbench = LightevalTaskConfig( + name="disfl_qa", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="disfl_qa", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +drop_lighteval = LightevalTaskConfig( + name="drop", + suite=["lighteval"], + prompt_function="drop", + hf_repo="lighteval/drop_harness", + hf_subset="default", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split="train", + few_shots_select="random_sampling_from_train", + generation_size=None, + metric=["drop"], + stop_sequence=["."], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +dyck_language_2_helm = LightevalTaskConfig( + name="dyck_language:2", + suite=["helm"], + prompt_function="dyck_language", + hf_repo="lighteval/DyckLanguage", + hf_subset="2", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +dyck_language_3_helm = LightevalTaskConfig( + name="dyck_language:3", + suite=["helm"], + prompt_function="dyck_language", + hf_repo="lighteval/DyckLanguage", + hf_subset="3", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +dyck_language_4_helm = LightevalTaskConfig( + name="dyck_language:4", + suite=["helm"], + prompt_function="dyck_language", + hf_repo="lighteval/DyckLanguage", + hf_subset="4", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +dyck_languages_bigbench = LightevalTaskConfig( + name="dyck_languages", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="dyck_languages", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + 
output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +elementary_math_qa_bigbench = LightevalTaskConfig( + name="elementary_math_qa", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="elementary_math_qa", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +emoji_movie_bigbench_lite = LightevalTaskConfig( + name="emoji_movie", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_linefeed_before_whitespace_after_query", + hf_repo="bigbench", + hf_subset="emoji_movie", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["rouge_t5", "bleu", "loglikelihood_acc", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +emojis_emotion_prediction_bigbench = LightevalTaskConfig( + name="emojis_emotion_prediction", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="emojis_emotion_prediction", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +empirical_judgments_bigbench = LightevalTaskConfig( + name="empirical_judgments", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="empirical_judgments", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +english_proverbs_bigbench = LightevalTaskConfig( + name="english_proverbs", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="english_proverbs", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +english_russian_proverbs_bigbench = LightevalTaskConfig( + name="english_russian_proverbs", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="english_russian_proverbs", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entailed_polarity_bigbench = LightevalTaskConfig( + name="entailed_polarity", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="entailed_polarity", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + 
stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entailed_polarity_hindi_bigbench = LightevalTaskConfig( + name="entailed_polarity_hindi", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="entailed_polarity_hindi", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entity_data_imputation_Buy_helm = LightevalTaskConfig( + name="entity_data_imputation:Buy", + suite=["helm"], + prompt_function="entity_data_imputation", + hf_repo="lighteval/Buy", + hf_subset="default", + hf_avail_splits=["train", "test", "valid"], + evaluation_splits=["valid", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entity_data_imputation_Restaurant_helm = LightevalTaskConfig( + name="entity_data_imputation:Restaurant", + suite=["helm"], + prompt_function="entity_data_imputation", + hf_repo="lighteval/Restaurant", + hf_subset="default", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entity_matching_Abt_Buy_helm = LightevalTaskConfig( + name="entity_matching:Abt_Buy", + suite=["helm"], + prompt_function="entity_matching", + hf_repo="lighteval/EntityMatching", + hf_subset="Abt_Buy", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entity_matching_Amazon_Google_helm = LightevalTaskConfig( + name="entity_matching:Amazon_Google", + suite=["helm"], + prompt_function="entity_matching", + hf_repo="lighteval/EntityMatching", + hf_subset="Amazon_Google", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entity_matching_Beer_helm = LightevalTaskConfig( + name="entity_matching:Beer", + suite=["helm"], + prompt_function="entity_matching", + hf_repo="lighteval/EntityMatching", + hf_subset="Beer", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entity_matching_Company_helm = LightevalTaskConfig( + name="entity_matching:Company", + suite=["helm"], + 
prompt_function="entity_matching", + hf_repo="lighteval/EntityMatching", + hf_subset="Company", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entity_matching_DBLP_ACM_helm = LightevalTaskConfig( + name="entity_matching:DBLP_ACM", + suite=["helm"], + prompt_function="entity_matching", + hf_repo="lighteval/EntityMatching", + hf_subset="DBLP_ACM", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entity_matching_DBLP_GoogleScholar_helm = LightevalTaskConfig( + name="entity_matching:DBLP_GoogleScholar", + suite=["helm"], + prompt_function="entity_matching", + hf_repo="lighteval/EntityMatching", + hf_subset="DBLP_GoogleScholar", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entity_matching_Dirty_DBLP_ACM_helm = LightevalTaskConfig( + name="entity_matching:Dirty_DBLP_ACM", + suite=["helm"], + prompt_function="entity_matching", + hf_repo="lighteval/EntityMatching", + hf_subset="Dirty_DBLP_ACM", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entity_matching_Dirty_DBLP_GoogleScholar_helm = LightevalTaskConfig( + name="entity_matching:Dirty_DBLP_GoogleScholar", + suite=["helm"], + prompt_function="entity_matching", + hf_repo="lighteval/EntityMatching", + hf_subset="Dirty_DBLP_GoogleScholar", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entity_matching_Dirty_Walmart_Amazon_helm = LightevalTaskConfig( + name="entity_matching:Dirty_Walmart_Amazon", + suite=["helm"], + prompt_function="entity_matching", + hf_repo="lighteval/EntityMatching", + hf_subset="Dirty_Walmart_Amazon", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entity_matching_Dirty_iTunes_Amazon_helm = LightevalTaskConfig( + name="entity_matching:Dirty_iTunes_Amazon", + suite=["helm"], + 
prompt_function="entity_matching", + hf_repo="lighteval/EntityMatching", + hf_subset="Dirty_iTunes_Amazon", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entity_matching_Fodors_Zagats_helm = LightevalTaskConfig( + name="entity_matching=Fodors_Zagats", + suite=["helm"], + prompt_function="entity_matching", + hf_repo="lighteval/EntityMatching", + hf_subset="Fodors_Zagats", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entity_matching_Walmart_Amazon_helm = LightevalTaskConfig( + name="entity_matching:Walmart_Amazon", + suite=["helm"], + prompt_function="entity_matching", + hf_repo="lighteval/EntityMatching", + hf_subset="Walmart_Amazon", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entity_matching_iTunes_Amazon_helm = LightevalTaskConfig( + name="entity_matching:iTunes_Amazon", + suite=["helm"], + prompt_function="entity_matching", + hf_repo="lighteval/EntityMatching", + hf_subset="iTunes_Amazon", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +epistemic_reasoning_bigbench = LightevalTaskConfig( + name="epistemic_reasoning", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="epistemic_reasoning", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +ethics_commonsense_lighteval = LightevalTaskConfig( + name="ethics:commonsense", + suite=["lighteval", "ethics"], + prompt_function="ethics_commonsense", + hf_repo="lighteval/hendrycks_ethics", + hf_subset="commonsense", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +ethics_deontology_lighteval = LightevalTaskConfig( + name="ethics:deontology", + suite=["lighteval", "ethics"], + prompt_function="ethics_deontology", + hf_repo="lighteval/hendrycks_ethics", + hf_subset="deontology", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, 
+ few_shots_select=None, + generation_size=5, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +ethics_justice_lighteval = LightevalTaskConfig( + name="ethics:justice", + suite=["lighteval", "ethics"], + prompt_function="ethics_justice", + hf_repo="lighteval/hendrycks_ethics", + hf_subset="justice", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +ethics_utilitarianism_lighteval = LightevalTaskConfig( + name="ethics:utilitarianism", + suite=["lighteval", "ethics"], + prompt_function="ethics_utilitarianism", + hf_repo="lighteval/hendrycks_ethics", + hf_subset="utilitarianism", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +ethics_virtue_lighteval = LightevalTaskConfig( + name="ethics:virtue", + suite=["lighteval", "ethics"], + prompt_function="ethics_virtue", + hf_repo="lighteval/hendrycks_ethics", + hf_subset="virtue", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +evaluating_information_essentiality_bigbench = LightevalTaskConfig( + name="evaluating_information_essentiality", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="evaluating_information_essentiality", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +fact_checker_bigbench = LightevalTaskConfig( + name="fact_checker", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="fact_checker", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +fantasy_reasoning_bigbench = LightevalTaskConfig( + name="fantasy_reasoning", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="fantasy_reasoning", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +few_shot_nlg_bigbench = LightevalTaskConfig( + name="few_shot_nlg", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="few_shot_nlg", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + 
generation_size=1, + metric=["bleu", "bleurt"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +figure_of_speech_detection_bigbench = LightevalTaskConfig( + name="figure_of_speech_detection", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="figure_of_speech_detection", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +formal_fallacies_syllogisms_negation_bigbench_lite = LightevalTaskConfig( + name="formal_fallacies_syllogisms_negation", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_linefeed_before_whitespace_after_query", + hf_repo="bigbench", + hf_subset="formal_fallacies_syllogisms_negation", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +gem_bigbench = LightevalTaskConfig( + name="gem", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="gem", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleu", "rouge_t5"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +gender_inclusive_sentences_german_bigbench = LightevalTaskConfig( + name="gender_inclusive_sentences_german", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="gender_inclusive_sentences_german", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +general_knowledge_bigbench = LightevalTaskConfig( + name="general_knowledge", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="general_knowledge", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +geometric_shapes_bigbench = LightevalTaskConfig( + name="geometric_shapes", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="geometric_shapes", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["rouge_t5", "bleu", "loglikelihood_acc", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +glue_cola_lighteval = LightevalTaskConfig( + name="glue:cola", + suite=["lighteval", "glue"], + prompt_function="cola", + hf_repo="glue", + hf_subset="cola", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, 
+ few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc_single_token", "mcc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +glue_mnli_lighteval = LightevalTaskConfig( + name="glue:mnli", + suite=["lighteval", "glue"], + prompt_function="mnli", + hf_repo="glue", + hf_subset="mnli_matched", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +glue_mnli_mismatched_lighteval = LightevalTaskConfig( + name="glue:mnli_mismatched", + suite=["lighteval", "glue"], + prompt_function="mnli", + hf_repo="glue", + hf_subset="mnli_mismatched", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +glue_mrpc_lighteval = LightevalTaskConfig( + name="glue:mrpc", + suite=["lighteval", "glue"], + prompt_function="mrpc", + hf_repo="glue", + hf_subset="mrpc", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_f1"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +glue_qnli_lighteval = LightevalTaskConfig( + name="glue:qnli", + suite=["lighteval", "glue"], + prompt_function="qnli", + hf_repo="glue", + hf_subset="qnli", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +glue_qqp_lighteval = LightevalTaskConfig( + name="glue:qqp", + suite=["lighteval", "glue"], + prompt_function="qqp", + hf_repo="glue", + hf_subset="qqp", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_f1"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +glue_rte_lighteval = LightevalTaskConfig( + name="glue:rte", + suite=["lighteval", "glue"], + prompt_function="rte", + hf_repo="glue", + hf_subset="rte", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +glue_sst2_lighteval = LightevalTaskConfig( + name="glue:sst2", + suite=["lighteval", "glue"], + prompt_function="sst", + hf_repo="glue", + hf_subset="sst2", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +glue_stsb_lighteval = LightevalTaskConfig( + name="glue:stsb", + suite=["lighteval", "glue"], + prompt_function="stsb", 
+ hf_repo="glue", + hf_subset="stsb", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +glue_wnli_lighteval = LightevalTaskConfig( + name="glue:wnli", + suite=["lighteval", "glue"], + prompt_function="wnli", + hf_repo="glue", + hf_subset="wnli", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +goal_step_wikihow_bigbench = LightevalTaskConfig( + name="goal_step_wikihow", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="goal_step_wikihow", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +gpqa_lighteval = LightevalTaskConfig( + name="gpqa", + suite=["lighteval"], + prompt_function="gpqa", + hf_repo="Idavidrein/gpqa", + hf_subset="gpqa_main", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +gre_reading_comprehension_bigbench = LightevalTaskConfig( + name="gre_reading_comprehension", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="gre_reading_comprehension", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +gsm8k_leaderboard = LightevalTaskConfig( + name="gsm8k", + suite=["leaderboard"], + prompt_function="gsm8k", + hf_repo="gsm8k", + hf_subset="main", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling_from_train", + generation_size=256, + metric=["quasi_exact_match_gsm8k"], + stop_sequence=["Question=", "Question", "="], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +gsm8k_lighteval = LightevalTaskConfig( + name="gsm8k", + suite=["lighteval"], + prompt_function="gsm8k", + hf_repo="gsm8k", + hf_subset="main", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling_from_train", + generation_size=256, + metric=["quasi_exact_match_gsm8k", "maj_at_8_gsm8k"], + stop_sequence=["Question="], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +headqa_en_lighteval = LightevalTaskConfig( + name="headqa:en", + suite=["lighteval", "headqa"], + prompt_function="headqa", + hf_repo="lighteval/headqa_harness", + hf_subset="en", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + 
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +headqa_es_lighteval = LightevalTaskConfig( + name="headqa:es", + suite=["lighteval", "headqa"], + prompt_function="headqa", + hf_repo="lighteval/headqa_harness", + hf_subset="es", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +hellaswag_leaderboard = LightevalTaskConfig( + name="hellaswag", + suite=["leaderboard"], + prompt_function="hellaswag_harness", + hf_repo="hellaswag", + hf_subset="default", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select="random_sampling_from_train", + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +hellaswag_helm = LightevalTaskConfig( + name="hellaswag", + suite=["helm", "helm_general"], + prompt_function="hellaswag_helm", + hf_repo="hellaswag", + hf_subset="default", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +hhh_alignment_bigbench = LightevalTaskConfig( + name="hhh_alignment", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="hhh_alignment", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +hindi_question_answering_bigbench = LightevalTaskConfig( + name="hindi_question_answering", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="hindi_question_answering", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleu", "rouge_t5", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +hindu_knowledge_bigbench_lite = LightevalTaskConfig( + name="hindu_knowledge", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_linefeed_before_whitespace_after_query", + hf_repo="bigbench", + hf_subset="hindu_knowledge", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +hinglish_toxicity_bigbench = LightevalTaskConfig( + name="hinglish_toxicity", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="hinglish_toxicity", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], 
+ few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +human_organs_senses_bigbench = LightevalTaskConfig( + name="human_organs_senses", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="human_organs_senses", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +humaneval_helm = LightevalTaskConfig( + name="humaneval", + suite=["helm", "code_scenario"], + prompt_function="humaneval", + hf_repo="openai_humaneval", + hf_subset="openai_humaneval", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=600, + metric=["code_humaneval"], + stop_sequence=["\nclass", "\ndef", "\nif", "\nprint"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +hyperbaton_bigbench = LightevalTaskConfig( + name="hyperbaton", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="hyperbaton", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +identify_math_theorems_bigbench = LightevalTaskConfig( + name="identify_math_theorems", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="identify_math_theorems", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +identify_odd_metaphor_bigbench = LightevalTaskConfig( + name="identify_odd_metaphor", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="identify_odd_metaphor", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +imdb_helm = LightevalTaskConfig( + name="imdb", + suite=["helm", "helm_general"], + prompt_function="imdb", + hf_repo="lighteval/IMDB_helm", + hf_subset="default", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +imdb_contrastset_helm = LightevalTaskConfig( + name="imdb:contrastset", + suite=["helm"], + prompt_function="imdb_contrastset", + hf_repo="lighteval/IMDB_helm", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=[ + "exact_match", + 
"quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +implicatures_bigbench = LightevalTaskConfig( + name="implicatures", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="implicatures", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +implicit_relations_bigbench = LightevalTaskConfig( + name="implicit_relations", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="implicit_relations", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +intent_recognition_bigbench = LightevalTaskConfig( + name="intent_recognition", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="intent_recognition", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +interactive_qa_mmlu_abstract_algebra_helm = LightevalTaskConfig( + name="interactive_qa_mmlu:abstract_algebra", + suite=["helm", "interactive_qa_mmlu_scenario"], + prompt_function="mmlu_qa_abstract_algebra", + hf_repo="lighteval/mmlu", + hf_subset="abstract_algebra", + hf_avail_splits=["dev", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +interactive_qa_mmlu_college_chemistry_helm = LightevalTaskConfig( + name="interactive_qa_mmlu:college_chemistry", + suite=["helm", "interactive_qa_mmlu_scenario"], + prompt_function="mmlu_qa_college_chemistry", + hf_repo="lighteval/mmlu", + hf_subset="college_chemistry", + hf_avail_splits=["dev", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +interactive_qa_mmlu_global_facts_helm = LightevalTaskConfig( + name="interactive_qa_mmlu:global_facts", + suite=["helm", "interactive_qa_mmlu_scenario"], + prompt_function="mmlu_qa_global_facts", + hf_repo="lighteval/mmlu", + hf_subset="global_facts", + hf_avail_splits=["dev", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +interactive_qa_mmlu_miscellaneous_helm = LightevalTaskConfig( + 
name="interactive_qa_mmlu:miscellaneous", + suite=["helm", "interactive_qa_mmlu_scenario"], + prompt_function="mmlu_qa_miscellaneous", + hf_repo="lighteval/mmlu", + hf_subset="miscellaneous", + hf_avail_splits=["dev", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +interactive_qa_mmlu_nutrition_helm = LightevalTaskConfig( + name="interactive_qa_mmlu:nutrition", + suite=["helm", "interactive_qa_mmlu_scenario"], + prompt_function="mmlu_qa_nutrition", + hf_repo="lighteval/mmlu", + hf_subset="nutrition", + hf_avail_splits=["dev", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +interactive_qa_mmlu_us_foreign_policy_helm = LightevalTaskConfig( + name="interactive_qa_mmlu:us_foreign_policy", + suite=["helm", "interactive_qa_mmlu_scenario"], + prompt_function="mmlu_qa_us_foreign_policy", + hf_repo="lighteval/mmlu", + hf_subset="us_foreign_policy", + hf_avail_splits=["dev", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +international_phonetic_alphabet_nli_bigbench = LightevalTaskConfig( + name="international_phonetic_alphabet_nli", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="international_phonetic_alphabet_nli", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +international_phonetic_alphabet_transliterate_bigbench = LightevalTaskConfig( + name="international_phonetic_alphabet_transliterate", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="international_phonetic_alphabet_transliterate", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleu", "rouge_t5", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +intersect_geometry_bigbench = LightevalTaskConfig( + name="intersect_geometry", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="intersect_geometry", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +irony_identification_bigbench = LightevalTaskConfig( + name="irony_identification", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="irony_identification", + 
hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +iwslt17_ar_en_lighteval = LightevalTaskConfig( + name="iwslt17:ar-en", + suite=["lighteval", "harness_selection"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_ar-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +iwslt17_de_en_lighteval = LightevalTaskConfig( + name="iwslt17:de-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +iwslt17_en_ar_lighteval = LightevalTaskConfig( + name="iwslt17:en-ar", + suite=["lighteval", "harness_selection"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_ar-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +iwslt17_en_de_lighteval = LightevalTaskConfig( + name="iwslt17:en-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +iwslt17_en_fr_lighteval = LightevalTaskConfig( + name="iwslt17:en-fr", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_en-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +iwslt17_en_ja_lighteval = LightevalTaskConfig( + name="iwslt17:en-ja", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_en-ja", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +iwslt17_en_ko_lighteval = LightevalTaskConfig( + name="iwslt17:en-ko", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_en-ko", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + 
metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +iwslt17_en_zh_lighteval = LightevalTaskConfig( + name="iwslt17:en-zh", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_en-zh", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +iwslt17_fr_en_lighteval = LightevalTaskConfig( + name="iwslt17:fr-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_fr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +iwslt17_ja_en_lighteval = LightevalTaskConfig( + name="iwslt17:ja-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_ja-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +iwslt17_ko_en_lighteval = LightevalTaskConfig( + name="iwslt17:ko-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_ko-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +iwslt17_zh_en_lighteval = LightevalTaskConfig( + name="iwslt17:zh-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_zh-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +kanji_ascii_bigbench = LightevalTaskConfig( + name="kanji_ascii", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="kanji_ascii", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +kannada_bigbench = LightevalTaskConfig( + name="kannada", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="kannada", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +key_value_maps_bigbench = LightevalTaskConfig( + 
name="key_value_maps", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="key_value_maps", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +known_unknowns_bigbench_lite = LightevalTaskConfig( + name="known_unknowns", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_linefeed_before_whitespace_after_query", + hf_repo="bigbench", + hf_subset="known_unknowns", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lambada_standard_lighteval = LightevalTaskConfig( + name="lambada:standard", + suite=["lighteval", "lambada"], + prompt_function="lambada", + hf_repo="lambada", + hf_subset="plain_text", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["target_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lambada_standard_cloze_lighteval = LightevalTaskConfig( + name="lambada:standard_cloze", + suite=["lighteval", "lambada"], + prompt_function="lambada_cloze", + hf_repo="lambada", + hf_subset="plain_text", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["target_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lambada_openai_lighteval = LightevalTaskConfig( + name="lambada:openai", + suite=["lighteval", "lambada"], + prompt_function="lambada", + hf_repo="EleutherAI/lambada_openai", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["target_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lambada_openai_de_lighteval = LightevalTaskConfig( + name="lambada:openai:de", + suite=["lighteval", "lambada"], + prompt_function="lambada", + hf_repo="EleutherAI/lambada_openai", + hf_subset="de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["target_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lambada_openai_en_lighteval = LightevalTaskConfig( + name="lambada:openai:en", + suite=["lighteval", "lambada"], + prompt_function="lambada", + hf_repo="EleutherAI/lambada_openai", + hf_subset="en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["target_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lambada_openai_es_lighteval = LightevalTaskConfig( + name="lambada:openai:es", + suite=["lighteval", "lambada"], + prompt_function="lambada", + hf_repo="EleutherAI/lambada_openai", + hf_subset="es", + hf_avail_splits=["test"], + 
evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["target_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lambada_openai_fr_lighteval = LightevalTaskConfig( + name="lambada:openai:fr", + suite=["lighteval", "lambada"], + prompt_function="lambada", + hf_repo="EleutherAI/lambada_openai", + hf_subset="fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["target_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lambada_openai_it_lighteval = LightevalTaskConfig( + name="lambada:openai:it", + suite=["lighteval", "lambada"], + prompt_function="lambada", + hf_repo="EleutherAI/lambada_openai", + hf_subset="it", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["target_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lambada_openai_cloze_lighteval = LightevalTaskConfig( + name="lambada:openai_cloze", + suite=["lighteval", "lambada"], + prompt_function="lambada_cloze", + hf_repo="EleutherAI/lambada_openai", + hf_subset="en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["target_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +language_games_bigbench = LightevalTaskConfig( + name="language_games", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="language_games", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleu", "rouge_t5", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +language_identification_bigbench_lite = LightevalTaskConfig( + name="language_identification", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="language_identification", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +legal_summarization_billsum_helm = LightevalTaskConfig( + name="legal_summarization:billsum", + suite=["helm"], + prompt_function="legal_summarization", + hf_repo="lighteval/legal_summarization", + hf_subset="BillSum", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=1024, + metric=["rouge1", "rouge2", "rougeL", "faithfulness", "extractiveness", "bert_score"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +legal_summarization_eurlexsum_helm = LightevalTaskConfig( + name="legal_summarization:eurlexsum", + suite=["helm"], + prompt_function="legal_summarization", + hf_repo="lighteval/legal_summarization", + hf_subset="EurLexSum", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + 
few_shots_select=None, + generation_size=2048, + metric=["rouge1", "rouge2", "rougeL", "faithfulness", "extractiveness", "bert_score"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +legal_summarization_multilexsum_helm = LightevalTaskConfig( + name="legal_summarization:multilexsum", + suite=["helm"], + prompt_function="multilexsum", + hf_repo="lighteval/legal_summarization", + hf_subset="MultiLexSum", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=256, + metric=["rouge1", "rouge2", "rougeL", "faithfulness", "extractiveness", "bert_score"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +legalsupport_helm = LightevalTaskConfig( + name="legalsupport", + suite=["helm"], + prompt_function="legal_support", + hf_repo="lighteval/LegalSupport", + hf_subset="default", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["loglikelihood_acc", "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lexglue_case_hold_helm = LightevalTaskConfig( + name="lexglue:case_hold", + suite=["helm", "lex_glue_scenario"], + prompt_function="lex_glue_case_hold", + hf_repo="lighteval/lexglue", + hf_subset="case_hold", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lexglue_ecthr_a_helm = LightevalTaskConfig( + name="lexglue:ecthr_a", + suite=["helm", "lex_glue_scenario"], + prompt_function="lex_glue_ecthr_a", + hf_repo="lighteval/lexglue", + hf_subset="ecthr_a", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lexglue_ecthr_b_helm = LightevalTaskConfig( + name="lexglue:ecthr_b", + suite=["helm", "lex_glue_scenario"], + prompt_function="lex_glue_ecthr_b", + hf_repo="lighteval/lexglue", + hf_subset="ecthr_b", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lexglue_eurlex_helm = LightevalTaskConfig( + name="lexglue:eurlex", + suite=["helm", "lex_glue_scenario"], + prompt_function="lex_glue_eurlex", + hf_repo="lighteval/lexglue", + hf_subset="eurlex", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + 
output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lexglue_ledgar_helm = LightevalTaskConfig( + name="lexglue:ledgar", + suite=["helm", "lex_glue_scenario"], + prompt_function="lex_glue_ledgar", + hf_repo="lighteval/lexglue", + hf_subset="ledgar", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lexglue_scotus_helm = LightevalTaskConfig( + name="lexglue:scotus", + suite=["helm", "lex_glue_scenario"], + prompt_function="lex_glue_scotus", + hf_repo="lighteval/lexglue", + hf_subset="scotus", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lexglue_unfair_tos_helm = LightevalTaskConfig( + name="lexglue:unfair_tos", + suite=["helm", "lex_glue_scenario"], + prompt_function="lex_glue_unfair_tos", + hf_repo="lighteval/lexglue", + hf_subset="unfair_tos", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_brazilian_court_decisions_judgment_helm = LightevalTaskConfig( + name="lextreme:brazilian_court_decisions_judgment", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_brazilian_court_decisions_judgment", + hf_repo="lighteval/lextreme", + hf_subset="brazilian_court_decisions_judgment", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_brazilian_court_decisions_unanimity_helm = LightevalTaskConfig( + name="lextreme:brazilian_court_decisions_unanimity", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_brazilian_court_decisions_unanimity", + hf_repo="lighteval/lextreme", + hf_subset="brazilian_court_decisions_unanimity", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_covid19_emergency_event_helm = LightevalTaskConfig( + name="lextreme:covid19_emergency_event", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_covid19_emergency_event", + hf_repo="lighteval/lextreme", + hf_subset="covid19_emergency_event", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + 
metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_german_argument_mining_helm = LightevalTaskConfig( + name="lextreme:german_argument_mining", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_german_argument_mining", + hf_repo="lighteval/lextreme", + hf_subset="german_argument_mining", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_greek_legal_code_chapter_helm = LightevalTaskConfig( + name="lextreme:greek_legal_code_chapter", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_greek_legal_code_chapter", + hf_repo="lighteval/lextreme", + hf_subset="greek_legal_code_chapter", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_greek_legal_code_subject_helm = LightevalTaskConfig( + name="lextreme:greek_legal_code_subject", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_greek_legal_code_subject", + hf_repo="lighteval/lextreme", + hf_subset="greek_legal_code_subject", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_greek_legal_code_volume_helm = LightevalTaskConfig( + name="lextreme:greek_legal_code_volume", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_greek_legal_code_volume", + hf_repo="lighteval/lextreme", + hf_subset="greek_legal_code_volume", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match", "f1_score"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_greek_legal_ner_helm = LightevalTaskConfig( + name="lextreme:greek_legal_ner", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_greek_legal_ner", + hf_repo="lighteval/lextreme", + hf_subset="greek_legal_ner", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=430, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_legalnero_helm = LightevalTaskConfig( + name="lextreme:legalnero", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_legalnero", + hf_repo="lighteval/lextreme", + hf_subset="legalnero", + hf_avail_splits=["train", "test", "validation"], + 
evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=788, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_lener_br_helm = LightevalTaskConfig( + name="lextreme:lener_br", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_lener_br", + hf_repo="lighteval/lextreme", + hf_subset="lener_br", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=338, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_mapa_coarse_helm = LightevalTaskConfig( + name="lextreme:mapa_coarse", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_mapa_coarse", + hf_repo="lighteval/lextreme", + hf_subset="mapa_coarse", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=274, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_mapa_fine_helm = LightevalTaskConfig( + name="lextreme:mapa_fine", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_mapa_fine", + hf_repo="lighteval/lextreme", + hf_subset="mapa_fine", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=274, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_multi_eurlex_level_1_helm = LightevalTaskConfig( + name="lextreme:multi_eurlex_level_1", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_multi_eurlex_level_1", + hf_repo="lighteval/lextreme", + hf_subset="multi_eurlex_level_1", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_multi_eurlex_level_2_helm = LightevalTaskConfig( + name="lextreme:multi_eurlex_level_2", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_multi_eurlex_level_2", + hf_repo="lighteval/lextreme", + hf_subset="multi_eurlex_level_2", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_multi_eurlex_level_3_helm = LightevalTaskConfig( + name="lextreme:multi_eurlex_level_3", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_multi_eurlex_level_3", + hf_repo="lighteval/lextreme", + hf_subset="multi_eurlex_level_3", + hf_avail_splits=["train", 
"test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_online_terms_of_service_clause_topics_helm = LightevalTaskConfig( + name="lextreme:online_terms_of_service_clause_topics", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_online_terms_of_service_clause_topics", + hf_repo="lighteval/lextreme", + hf_subset="online_terms_of_service_clause_topics", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_online_terms_of_service_unfairness_levels_helm = LightevalTaskConfig( + name="lextreme:online_terms_of_service_unfairness_levels", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_online_terms_of_service_unfairness_levels", + hf_repo="lighteval/lextreme", + hf_subset="online_terms_of_service_unfairness_levels", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_swiss_judgment_prediction_helm = LightevalTaskConfig( + name="lextreme:swiss_judgment_prediction", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_swiss_judgment_prediction", + hf_repo="lighteval/lextreme", + hf_subset="swiss_judgment_prediction", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +linguistic_mappings_bigbench = LightevalTaskConfig( + name="linguistic_mappings", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="linguistic_mappings", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +linguistics_puzzles_bigbench_lite = LightevalTaskConfig( + name="linguistics_puzzles", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_whitespace_after_query", + hf_repo="bigbench", + hf_subset="linguistics_puzzles", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["bleu", "rouge_t5", "perfect_exact_match"], + stop_sequence=None, + output_regex="[^\\.\\?\\!\\;\\n]+", + trust_dataset=True, + version=0, +) +logic_grid_puzzle_bigbench_lite = LightevalTaskConfig( + name="logic_grid_puzzle", + suite=["bigbench_lite", "bigbench", 
"bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="logic_grid_puzzle", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +logical_args_bigbench = LightevalTaskConfig( + name="logical_args", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="logical_args", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +logical_deduction_bigbench_lite = LightevalTaskConfig( + name="logical_deduction", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_whitespace_after_query", + hf_repo="bigbench", + hf_subset="logical_deduction", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +logical_fallacy_detection_bigbench = LightevalTaskConfig( + name="logical_fallacy_detection", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="logical_fallacy_detection", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +logical_sequence_bigbench = LightevalTaskConfig( + name="logical_sequence", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="logical_sequence", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +logiqa_lighteval = LightevalTaskConfig( + name="logiqa", + suite=["lighteval"], + prompt_function="logiqa", + hf_repo="lighteval/logiqa_harness", + hf_subset="logiqa", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lsat_qa_helm = LightevalTaskConfig( + name="lsat_qa", + suite=["helm", "lsat_qa_scenario"], + prompt_function="lsat_qa", + hf_repo="lighteval/lsat_qa", + hf_subset="all", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lsat_qa_assignment_helm = LightevalTaskConfig( + name="lsat_qa:assignment", + suite=["helm", "lsat_qa_scenario"], + 
prompt_function="lsat_qa", + hf_repo="lighteval/lsat_qa", + hf_subset="assignment", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lsat_qa_grouping_helm = LightevalTaskConfig( + name="lsat_qa:grouping", + suite=["helm", "lsat_qa_scenario"], + prompt_function="lsat_qa", + hf_repo="lighteval/lsat_qa", + hf_subset="grouping", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lsat_qa_miscellaneous_helm = LightevalTaskConfig( + name="lsat_qa:miscellaneous", + suite=["helm", "lsat_qa_scenario"], + prompt_function="lsat_qa", + hf_repo="lighteval/lsat_qa", + hf_subset="miscellaneous", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lsat_qa_ordering_helm = LightevalTaskConfig( + name="lsat_qa:ordering", + suite=["helm", "lsat_qa_scenario"], + prompt_function="lsat_qa", + hf_repo="lighteval/lsat_qa", + hf_subset="ordering", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +math_algebra_lighteval = LightevalTaskConfig( + name="math:algebra", + suite=["lighteval", "math"], + prompt_function="math", + hf_repo="lighteval/MATH", + hf_subset="algebra", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metric=["quasi_exact_match_math", "maj_at_4_math"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=1, +) +math_counting_and_probability_lighteval = LightevalTaskConfig( + name="math:counting_and_probability", + suite=["lighteval", "math"], + prompt_function="math", + hf_repo="lighteval/MATH", + hf_subset="counting_and_probability", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metric=["quasi_exact_match_math", "maj_at_4_math"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=1, +) +math_geometry_lighteval = LightevalTaskConfig( + name="math:geometry", + suite=["lighteval", "math"], + prompt_function="math", + hf_repo="lighteval/MATH", + hf_subset="geometry", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metric=["quasi_exact_match_math", "maj_at_4_math"], + 
stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=1, +) +math_intermediate_algebra_lighteval = LightevalTaskConfig( + name="math:intermediate_algebra", + suite=["lighteval", "math"], + prompt_function="math", + hf_repo="lighteval/MATH", + hf_subset="intermediate_algebra", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metric=["quasi_exact_match_math", "maj_at_4_math"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=1, +) +math_number_theory_lighteval = LightevalTaskConfig( + name="math:number_theory", + suite=["lighteval", "math"], + prompt_function="math", + hf_repo="lighteval/MATH", + hf_subset="number_theory", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metric=["quasi_exact_match_math", "maj_at_4_math"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=1, +) +math_prealgebra_lighteval = LightevalTaskConfig( + name="math:prealgebra", + suite=["lighteval", "math"], + prompt_function="math", + hf_repo="lighteval/MATH", + hf_subset="prealgebra", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metric=["quasi_exact_match_math", "maj_at_4_math"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=1, +) +math_precalculus_lighteval = LightevalTaskConfig( + name="math:precalculus", + suite=["lighteval", "math"], + prompt_function="math", + hf_repo="lighteval/MATH", + hf_subset="precalculus", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metric=["quasi_exact_match_math", "maj_at_4_math"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=1, +) +math_cot_algebra_lighteval = LightevalTaskConfig( + name="math_cot:algebra", + suite=["lighteval", "math"], + prompt_function="math_cot", + hf_repo="lighteval/MATH", + hf_subset="algebra", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metric=["quasi_exact_match_math", "maj_at_4_math"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +math_cot_counting_and_probability_lighteval = LightevalTaskConfig( + name="math_cot:counting_and_probability", + suite=["lighteval", "math"], + prompt_function="math_cot", + hf_repo="lighteval/MATH", + hf_subset="counting_and_probability", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metric=["quasi_exact_match_math", "maj_at_4_math"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +math_cot_geometry_lighteval = LightevalTaskConfig( + name="math_cot:geometry", + suite=["lighteval", "math"], + prompt_function="math_cot", + hf_repo="lighteval/MATH", + hf_subset="geometry", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metric=["quasi_exact_match_math", 
"maj_at_4_math"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +math_cot_intermediate_algebra_lighteval = LightevalTaskConfig( + name="math_cot:intermediate_algebra", + suite=["lighteval", "math"], + prompt_function="math_cot", + hf_repo="lighteval/MATH", + hf_subset="intermediate_algebra", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metric=["quasi_exact_match_math", "maj_at_4_math"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +math_cot_number_theory_lighteval = LightevalTaskConfig( + name="math_cot:number_theory", + suite=["lighteval", "math"], + prompt_function="math_cot", + hf_repo="lighteval/MATH", + hf_subset="number_theory", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metric=["quasi_exact_match_math", "maj_at_4_math"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +math_cot_prealgebra_lighteval = LightevalTaskConfig( + name="math_cot:prealgebra", + suite=["lighteval", "math"], + prompt_function="math_cot", + hf_repo="lighteval/MATH", + hf_subset="prealgebra", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metric=["quasi_exact_match_math", "maj_at_4_math"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +math_cot_precalculus_lighteval = LightevalTaskConfig( + name="math_cot:precalculus", + suite=["lighteval", "math"], + prompt_function="math_cot", + hf_repo="lighteval/MATH", + hf_subset="precalculus", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metric=["quasi_exact_match_math", "maj_at_4_math"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mathematical_induction_bigbench = LightevalTaskConfig( + name="mathematical_induction", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="mathematical_induction", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mathqa_lighteval = LightevalTaskConfig( + name="mathqa", + suite=["lighteval"], + prompt_function="mathqa", + hf_repo="math_qa", + hf_subset="default", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +matrixshapes_bigbench = LightevalTaskConfig( + name="matrixshapes", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="matrixshapes", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], 
+ output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +me_q_sum_helm = LightevalTaskConfig( + name="me_q_sum", + suite=["helm"], + prompt_function="me_q_sum", + hf_repo="lighteval/me_q_sum", + hf_subset="default", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=["exact_match", "quasi_exact_match", "f1_score", "rougeL", "bleu_1", "bleu_4"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +med_dialog_healthcaremagic_helm = LightevalTaskConfig( + name="med_dialog:healthcaremagic", + suite=["helm"], + prompt_function="med_dialog", + hf_repo="lighteval/med_dialog", + hf_subset="healthcaremagic", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=["exact_match", "quasi_exact_match", "f1_score", "rougeL", "bleu_1", "bleu_4"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +med_dialog_icliniq_helm = LightevalTaskConfig( + name="med_dialog:icliniq", + suite=["helm"], + prompt_function="med_dialog", + hf_repo="lighteval/med_dialog", + hf_subset="icliniq", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=["exact_match", "quasi_exact_match", "f1_score", "rougeL", "bleu_1", "bleu_4"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +med_mcqa_helm = LightevalTaskConfig( + name="med_mcqa", + suite=["helm"], + prompt_function="med_mcqa", + hf_repo="lighteval/med_mcqa", + hf_subset="default", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["loglikelihood_acc", "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +med_paragraph_simplification_helm = LightevalTaskConfig( + name="med_paragraph_simplification", + suite=["helm"], + prompt_function="med_paragraph_simplification", + hf_repo="lighteval/med_paragraph_simplification", + hf_subset="default", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=512, + metric=["exact_match", "quasi_exact_match", "f1_score", "rougeL", "bleu_1", "bleu_4"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +med_qa_helm = LightevalTaskConfig( + name="med_qa", + suite=["helm"], + prompt_function="med_qa", + hf_repo="bigbio/med_qa", + hf_subset="med_qa_en_source", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["loglikelihood_acc", "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +metaphor_boolean_bigbench = LightevalTaskConfig( + name="metaphor_boolean", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + 
hf_subset="metaphor_boolean", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +metaphor_understanding_bigbench = LightevalTaskConfig( + name="metaphor_understanding", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="metaphor_understanding", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mgsm_en_lighteval = LightevalTaskConfig( + name="mgsm:en", + suite=["lighteval"], + prompt_function="mgsm_en", + hf_repo="juletxara/mgsm", + hf_subset="en", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n", "=", "Question="], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mgsm_es_lighteval = LightevalTaskConfig( + name="mgsm:es", + suite=["lighteval"], + prompt_function="mgsm_es", + hf_repo="juletxara/mgsm", + hf_subset="es", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n", "=", "Pregunta="], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mgsm_fr_lighteval = LightevalTaskConfig( + name="mgsm:fr", + suite=["lighteval"], + prompt_function="mgsm_fr", + hf_repo="juletxara/mgsm", + hf_subset="fr", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n", "=", "Question="], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mgsm_de_lighteval = LightevalTaskConfig( + name="mgsm:de", + suite=["lighteval"], + prompt_function="mgsm_de", + hf_repo="juletxara/mgsm", + hf_subset="de", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n", "=", "Frage="], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mgsm_ru_lighteval = LightevalTaskConfig( + name="mgsm:ru", + suite=["lighteval"], + prompt_function="mgsm_ru", + hf_repo="juletxara/mgsm", + hf_subset="ru", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n", "=", "\u0417\u0430\u0434\u0430\u0447\u0430="], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mgsm_zh_lighteval = LightevalTaskConfig( + name="mgsm:zh", + suite=["lighteval"], + prompt_function="mgsm_zh", + hf_repo="juletxara/mgsm", + hf_subset="zh", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n", "=", "\u95ee\u9898="], + 
output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mgsm_ja_lighteval = LightevalTaskConfig( + name="mgsm:ja", + suite=["lighteval"], + prompt_function="mgsm_ja", + hf_repo="juletxara/mgsm", + hf_subset="ja", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n", "=", "\u554f\u984c="], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mgsm_th_lighteval = LightevalTaskConfig( + name="mgsm:th", + suite=["lighteval"], + prompt_function="mgsm_th", + hf_repo="juletxara/mgsm", + hf_subset="th", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n", "=", "\u0e42\u0e08\u0e17\u0e22\u0e4c="], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mgsm_sw_lighteval = LightevalTaskConfig( + name="mgsm:sw", + suite=["lighteval"], + prompt_function="mgsm_sw", + hf_repo="juletxara/mgsm", + hf_subset="sw", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n", "=", "Swali="], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mgsm_bn_lighteval = LightevalTaskConfig( + name="mgsm:bn", + suite=["lighteval"], + prompt_function="mgsm_bn", + hf_repo="juletxara/mgsm", + hf_subset="bn", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n", "=", "\u09aa\u09cd\u09b0\u09b6\u09cd\u09a8="], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mgsm_te_lighteval = LightevalTaskConfig( + name="mgsm:te", + suite=["lighteval"], + prompt_function="mgsm_te", + hf_repo="juletxara/mgsm", + hf_subset="te", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n", "=", "\u0c2a\u0c4d\u0c30\u0c36\u0c4d\u0c28="], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +minute_mysteries_qa_bigbench = LightevalTaskConfig( + name="minute_mysteries_qa", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="minute_mysteries_qa", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "rouge_t5"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +misconceptions_bigbench = LightevalTaskConfig( + name="misconceptions", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="misconceptions", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +misconceptions_russian_bigbench_lite = LightevalTaskConfig( + name="misconceptions_russian", + 
suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="misconceptions_russian", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_helm = LightevalTaskConfig( + name="mmlu", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="all", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_original = LightevalTaskConfig( + name="mmlu", + suite=["original"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="all", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=5, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_abstract_algebra_original = LightevalTaskConfig( + name="mmlu:abstract_algebra", + suite=["original", "mmlu"], + prompt_function="mmlu_abstract_algebra", + hf_repo="cais/mmlu", + hf_subset="abstract_algebra", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_abstract_algebra_leaderboard = LightevalTaskConfig( + name="mmlu:abstract_algebra", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="abstract_algebra", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_abstract_algebra_helm = LightevalTaskConfig( + name="mmlu:abstract_algebra", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="abstract_algebra", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_anatomy_original = LightevalTaskConfig( + name="mmlu:anatomy", + suite=["original", "mmlu"], + prompt_function="mmlu_anatomy", + hf_repo="cais/mmlu", + hf_subset="anatomy", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + 
version=0, +) +mmlu_anatomy_leaderboard = LightevalTaskConfig( + name="mmlu:anatomy", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="anatomy", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_anatomy_helm = LightevalTaskConfig( + name="mmlu:anatomy", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="anatomy", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_astronomy_original = LightevalTaskConfig( + name="mmlu:astronomy", + suite=["original", "mmlu"], + prompt_function="mmlu_astronomy", + hf_repo="cais/mmlu", + hf_subset="astronomy", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_astronomy_leaderboard = LightevalTaskConfig( + name="mmlu:astronomy", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="astronomy", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_astronomy_helm = LightevalTaskConfig( + name="mmlu:astronomy", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="astronomy", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_business_ethics_original = LightevalTaskConfig( + name="mmlu:business_ethics", + suite=["original", "mmlu"], + prompt_function="mmlu_business_ethics", + hf_repo="cais/mmlu", + hf_subset="business_ethics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_business_ethics_leaderboard = LightevalTaskConfig( + name="mmlu:business_ethics", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="business_ethics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + 
metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_business_ethics_helm = LightevalTaskConfig( + name="mmlu:business_ethics", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="business_ethics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_clinical_knowledge_original = LightevalTaskConfig( + name="mmlu:clinical_knowledge", + suite=["original", "mmlu"], + prompt_function="mmlu_clinical_knowledge", + hf_repo="cais/mmlu", + hf_subset="clinical_knowledge", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_clinical_knowledge_leaderboard = LightevalTaskConfig( + name="mmlu:clinical_knowledge", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="clinical_knowledge", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_clinical_knowledge_helm = LightevalTaskConfig( + name="mmlu:clinical_knowledge", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="clinical_knowledge", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_biology_original = LightevalTaskConfig( + name="mmlu:college_biology", + suite=["original", "mmlu"], + prompt_function="mmlu_college_biology", + hf_repo="cais/mmlu", + hf_subset="college_biology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_biology_leaderboard = LightevalTaskConfig( + name="mmlu:college_biology", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="college_biology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_biology_helm = LightevalTaskConfig( + name="mmlu:college_biology", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + 
hf_repo="lighteval/mmlu", + hf_subset="college_biology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_chemistry_original = LightevalTaskConfig( + name="mmlu:college_chemistry", + suite=["original", "mmlu"], + prompt_function="mmlu_college_chemistry", + hf_repo="cais/mmlu", + hf_subset="college_chemistry", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_chemistry_leaderboard = LightevalTaskConfig( + name="mmlu:college_chemistry", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="college_chemistry", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_chemistry_helm = LightevalTaskConfig( + name="mmlu:college_chemistry", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="college_chemistry", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_computer_science_original = LightevalTaskConfig( + name="mmlu:college_computer_science", + suite=["original", "mmlu"], + prompt_function="mmlu_college_computer_science", + hf_repo="cais/mmlu", + hf_subset="college_computer_science", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_computer_science_leaderboard = LightevalTaskConfig( + name="mmlu:college_computer_science", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="college_computer_science", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_computer_science_helm = LightevalTaskConfig( + name="mmlu:college_computer_science", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="college_computer_science", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + 
few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_mathematics_original = LightevalTaskConfig( + name="mmlu:college_mathematics", + suite=["original", "mmlu"], + prompt_function="mmlu_college_mathematics", + hf_repo="cais/mmlu", + hf_subset="college_mathematics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_mathematics_leaderboard = LightevalTaskConfig( + name="mmlu:college_mathematics", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="college_mathematics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_mathematics_helm = LightevalTaskConfig( + name="mmlu:college_mathematics", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="college_mathematics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_medicine_original = LightevalTaskConfig( + name="mmlu:college_medicine", + suite=["original", "mmlu"], + prompt_function="mmlu_college_medicine", + hf_repo="cais/mmlu", + hf_subset="college_medicine", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_medicine_leaderboard = LightevalTaskConfig( + name="mmlu:college_medicine", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="college_medicine", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_medicine_helm = LightevalTaskConfig( + name="mmlu:college_medicine", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="college_medicine", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) 
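+# The MMLU subject tasks below repeat the same three-variant pattern as the subjects
+# above: an "original" config (cais/mmlu, loglikelihood_acc_single_token), a
+# "leaderboard" config (lighteval/mmlu, loglikelihood_acc), and a "helm" config
+# (lighteval/mmlu, generative with exact-match and prefix-match metrics).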
+mmlu_college_physics_original = LightevalTaskConfig( + name="mmlu:college_physics", + suite=["original", "mmlu"], + prompt_function="mmlu_college_physics", + hf_repo="cais/mmlu", + hf_subset="college_physics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_physics_leaderboard = LightevalTaskConfig( + name="mmlu:college_physics", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="college_physics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_physics_helm = LightevalTaskConfig( + name="mmlu:college_physics", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="college_physics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_computer_security_original = LightevalTaskConfig( + name="mmlu:computer_security", + suite=["original", "mmlu"], + prompt_function="mmlu_computer_security", + hf_repo="cais/mmlu", + hf_subset="computer_security", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_computer_security_leaderboard = LightevalTaskConfig( + name="mmlu:computer_security", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="computer_security", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_computer_security_helm = LightevalTaskConfig( + name="mmlu:computer_security", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="computer_security", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_conceptual_physics_original = LightevalTaskConfig( + name="mmlu:conceptual_physics", + suite=["original", "mmlu"], + prompt_function="mmlu_conceptual_physics", + hf_repo="cais/mmlu", + hf_subset="conceptual_physics", + hf_avail_splits=["auxiliary_train", "test", "validation", 
"dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_conceptual_physics_leaderboard = LightevalTaskConfig( + name="mmlu:conceptual_physics", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="conceptual_physics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_conceptual_physics_helm = LightevalTaskConfig( + name="mmlu:conceptual_physics", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="conceptual_physics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_econometrics_original = LightevalTaskConfig( + name="mmlu:econometrics", + suite=["original", "mmlu"], + prompt_function="mmlu_econometrics", + hf_repo="cais/mmlu", + hf_subset="econometrics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_econometrics_leaderboard = LightevalTaskConfig( + name="mmlu:econometrics", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="econometrics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_econometrics_helm = LightevalTaskConfig( + name="mmlu:econometrics", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="econometrics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_electrical_engineering_original = LightevalTaskConfig( + name="mmlu:electrical_engineering", + suite=["original", "mmlu"], + prompt_function="mmlu_electrical_engineering", + hf_repo="cais/mmlu", + hf_subset="electrical_engineering", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) 
+mmlu_electrical_engineering_leaderboard = LightevalTaskConfig( + name="mmlu:electrical_engineering", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="electrical_engineering", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_electrical_engineering_helm = LightevalTaskConfig( + name="mmlu:electrical_engineering", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="electrical_engineering", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_elementary_mathematics_original = LightevalTaskConfig( + name="mmlu:elementary_mathematics", + suite=["original", "mmlu"], + prompt_function="mmlu_elementary_mathematics", + hf_repo="cais/mmlu", + hf_subset="elementary_mathematics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_elementary_mathematics_leaderboard = LightevalTaskConfig( + name="mmlu:elementary_mathematics", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="elementary_mathematics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_elementary_mathematics_helm = LightevalTaskConfig( + name="mmlu:elementary_mathematics", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="elementary_mathematics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_formal_logic_original = LightevalTaskConfig( + name="mmlu:formal_logic", + suite=["original", "mmlu"], + prompt_function="mmlu_formal_logic", + hf_repo="cais/mmlu", + hf_subset="formal_logic", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_formal_logic_leaderboard = LightevalTaskConfig( + name="mmlu:formal_logic", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="formal_logic", + 
hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_formal_logic_helm = LightevalTaskConfig( + name="mmlu:formal_logic", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="formal_logic", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_global_facts_original = LightevalTaskConfig( + name="mmlu:global_facts", + suite=["original", "mmlu"], + prompt_function="mmlu_global_facts", + hf_repo="cais/mmlu", + hf_subset="global_facts", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_global_facts_leaderboard = LightevalTaskConfig( + name="mmlu:global_facts", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="global_facts", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_global_facts_helm = LightevalTaskConfig( + name="mmlu:global_facts", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="global_facts", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_biology_original = LightevalTaskConfig( + name="mmlu:high_school_biology", + suite=["original", "mmlu"], + prompt_function="mmlu_high_school_biology", + hf_repo="cais/mmlu", + hf_subset="high_school_biology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_biology_leaderboard = LightevalTaskConfig( + name="mmlu:high_school_biology", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="high_school_biology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) 
+mmlu_high_school_biology_helm = LightevalTaskConfig( + name="mmlu:high_school_biology", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="high_school_biology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_chemistry_original = LightevalTaskConfig( + name="mmlu:high_school_chemistry", + suite=["original", "mmlu"], + prompt_function="mmlu_high_school_chemistry", + hf_repo="cais/mmlu", + hf_subset="high_school_chemistry", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_chemistry_leaderboard = LightevalTaskConfig( + name="mmlu:high_school_chemistry", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="high_school_chemistry", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_chemistry_helm = LightevalTaskConfig( + name="mmlu:high_school_chemistry", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="high_school_chemistry", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_computer_science_original = LightevalTaskConfig( + name="mmlu:high_school_computer_science", + suite=["original", "mmlu"], + prompt_function="mmlu_high_school_computer_science", + hf_repo="cais/mmlu", + hf_subset="high_school_computer_science", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_computer_science_leaderboard = LightevalTaskConfig( + name="mmlu:high_school_computer_science", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="high_school_computer_science", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_computer_science_helm = LightevalTaskConfig( + name="mmlu:high_school_computer_science", + suite=["helm", "helm_general"], + 
prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="high_school_computer_science", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_european_history_original = LightevalTaskConfig( + name="mmlu:high_school_european_history", + suite=["original", "mmlu"], + prompt_function="mmlu_high_school_european_history", + hf_repo="cais/mmlu", + hf_subset="high_school_european_history", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_european_history_leaderboard = LightevalTaskConfig( + name="mmlu:high_school_european_history", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="high_school_european_history", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_european_history_helm = LightevalTaskConfig( + name="mmlu:high_school_european_history", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="high_school_european_history", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_geography_original = LightevalTaskConfig( + name="mmlu:high_school_geography", + suite=["original", "mmlu"], + prompt_function="mmlu_high_school_geography", + hf_repo="cais/mmlu", + hf_subset="high_school_geography", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_geography_leaderboard = LightevalTaskConfig( + name="mmlu:high_school_geography", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="high_school_geography", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_geography_helm = LightevalTaskConfig( + name="mmlu:high_school_geography", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="high_school_geography", + 
hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_government_and_politics_original = LightevalTaskConfig( + name="mmlu:high_school_government_and_politics", + suite=["original", "mmlu"], + prompt_function="mmlu_high_school_government_and_politics", + hf_repo="cais/mmlu", + hf_subset="high_school_government_and_politics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_government_and_politics_leaderboard = LightevalTaskConfig( + name="mmlu:high_school_government_and_politics", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="high_school_government_and_politics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_government_and_politics_helm = LightevalTaskConfig( + name="mmlu:high_school_government_and_politics", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="high_school_government_and_politics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_macroeconomics_original = LightevalTaskConfig( + name="mmlu:high_school_macroeconomics", + suite=["original", "mmlu"], + prompt_function="mmlu_high_school_macroeconomics", + hf_repo="cais/mmlu", + hf_subset="high_school_macroeconomics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_macroeconomics_leaderboard = LightevalTaskConfig( + name="mmlu:high_school_macroeconomics", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="high_school_macroeconomics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_macroeconomics_helm = LightevalTaskConfig( + name="mmlu:high_school_macroeconomics", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + 
hf_subset="high_school_macroeconomics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_mathematics_original = LightevalTaskConfig( + name="mmlu:high_school_mathematics", + suite=["original", "mmlu"], + prompt_function="mmlu_high_school_mathematics", + hf_repo="cais/mmlu", + hf_subset="high_school_mathematics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_mathematics_leaderboard = LightevalTaskConfig( + name="mmlu:high_school_mathematics", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="high_school_mathematics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_mathematics_helm = LightevalTaskConfig( + name="mmlu:high_school_mathematics", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="high_school_mathematics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_microeconomics_original = LightevalTaskConfig( + name="mmlu:high_school_microeconomics", + suite=["original", "mmlu"], + prompt_function="mmlu_high_school_microeconomics", + hf_repo="cais/mmlu", + hf_subset="high_school_microeconomics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_microeconomics_leaderboard = LightevalTaskConfig( + name="mmlu:high_school_microeconomics", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="high_school_microeconomics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_microeconomics_helm = LightevalTaskConfig( + name="mmlu:high_school_microeconomics", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="high_school_microeconomics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + 
evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_physics_original = LightevalTaskConfig( + name="mmlu:high_school_physics", + suite=["original", "mmlu"], + prompt_function="mmlu_high_school_physics", + hf_repo="cais/mmlu", + hf_subset="high_school_physics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_physics_leaderboard = LightevalTaskConfig( + name="mmlu:high_school_physics", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="high_school_physics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_physics_helm = LightevalTaskConfig( + name="mmlu:high_school_physics", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="high_school_physics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_psychology_original = LightevalTaskConfig( + name="mmlu:high_school_psychology", + suite=["original", "mmlu"], + prompt_function="mmlu_high_school_psychology", + hf_repo="cais/mmlu", + hf_subset="high_school_psychology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_psychology_leaderboard = LightevalTaskConfig( + name="mmlu:high_school_psychology", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="high_school_psychology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_psychology_helm = LightevalTaskConfig( + name="mmlu:high_school_psychology", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="high_school_psychology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], 
+ stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_statistics_original = LightevalTaskConfig( + name="mmlu:high_school_statistics", + suite=["original", "mmlu"], + prompt_function="mmlu_high_school_statistics", + hf_repo="cais/mmlu", + hf_subset="high_school_statistics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_statistics_leaderboard = LightevalTaskConfig( + name="mmlu:high_school_statistics", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="high_school_statistics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_statistics_helm = LightevalTaskConfig( + name="mmlu:high_school_statistics", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="high_school_statistics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_us_history_original = LightevalTaskConfig( + name="mmlu:high_school_us_history", + suite=["original", "mmlu"], + prompt_function="mmlu_high_school_us_history", + hf_repo="cais/mmlu", + hf_subset="high_school_us_history", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_us_history_leaderboard = LightevalTaskConfig( + name="mmlu:high_school_us_history", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="high_school_us_history", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_us_history_helm = LightevalTaskConfig( + name="mmlu:high_school_us_history", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="high_school_us_history", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_world_history_original = LightevalTaskConfig( + 
name="mmlu:high_school_world_history", + suite=["original", "mmlu"], + prompt_function="mmlu_high_school_world_history", + hf_repo="cais/mmlu", + hf_subset="high_school_world_history", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_world_history_leaderboard = LightevalTaskConfig( + name="mmlu:high_school_world_history", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="high_school_world_history", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_world_history_helm = LightevalTaskConfig( + name="mmlu:high_school_world_history", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="high_school_world_history", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_human_aging_original = LightevalTaskConfig( + name="mmlu:human_aging", + suite=["original", "mmlu"], + prompt_function="mmlu_human_aging", + hf_repo="cais/mmlu", + hf_subset="human_aging", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_human_aging_leaderboard = LightevalTaskConfig( + name="mmlu:human_aging", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="human_aging", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_human_aging_helm = LightevalTaskConfig( + name="mmlu:human_aging", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="human_aging", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_human_sexuality_original = LightevalTaskConfig( + name="mmlu:human_sexuality", + suite=["original", "mmlu"], + prompt_function="mmlu_human_sexuality", + hf_repo="cais/mmlu", + hf_subset="human_sexuality", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + 
few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_human_sexuality_leaderboard = LightevalTaskConfig( + name="mmlu:human_sexuality", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="human_sexuality", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_human_sexuality_helm = LightevalTaskConfig( + name="mmlu:human_sexuality", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="human_sexuality", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_international_law_original = LightevalTaskConfig( + name="mmlu:international_law", + suite=["original", "mmlu"], + prompt_function="mmlu_international_law", + hf_repo="cais/mmlu", + hf_subset="international_law", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_international_law_leaderboard = LightevalTaskConfig( + name="mmlu:international_law", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="international_law", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_international_law_helm = LightevalTaskConfig( + name="mmlu:international_law", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="international_law", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_jurisprudence_original = LightevalTaskConfig( + name="mmlu:jurisprudence", + suite=["original", "mmlu"], + prompt_function="mmlu_jurisprudence", + hf_repo="cais/mmlu", + hf_subset="jurisprudence", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_jurisprudence_leaderboard = LightevalTaskConfig( + 
name="mmlu:jurisprudence", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="jurisprudence", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_jurisprudence_helm = LightevalTaskConfig( + name="mmlu:jurisprudence", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="jurisprudence", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_logical_fallacies_original = LightevalTaskConfig( + name="mmlu:logical_fallacies", + suite=["original", "mmlu"], + prompt_function="mmlu_logical_fallacies", + hf_repo="cais/mmlu", + hf_subset="logical_fallacies", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_logical_fallacies_leaderboard = LightevalTaskConfig( + name="mmlu:logical_fallacies", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="logical_fallacies", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_logical_fallacies_helm = LightevalTaskConfig( + name="mmlu:logical_fallacies", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="logical_fallacies", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_machine_learning_original = LightevalTaskConfig( + name="mmlu:machine_learning", + suite=["original", "mmlu"], + prompt_function="mmlu_machine_learning", + hf_repo="cais/mmlu", + hf_subset="machine_learning", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_machine_learning_leaderboard = LightevalTaskConfig( + name="mmlu:machine_learning", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="machine_learning", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + 
few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_machine_learning_helm = LightevalTaskConfig( + name="mmlu:machine_learning", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="machine_learning", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_management_original = LightevalTaskConfig( + name="mmlu:management", + suite=["original", "mmlu"], + prompt_function="mmlu_management", + hf_repo="cais/mmlu", + hf_subset="management", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_management_leaderboard = LightevalTaskConfig( + name="mmlu:management", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="management", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_management_helm = LightevalTaskConfig( + name="mmlu:management", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="management", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_marketing_original = LightevalTaskConfig( + name="mmlu:marketing", + suite=["original", "mmlu"], + prompt_function="mmlu_marketing", + hf_repo="cais/mmlu", + hf_subset="marketing", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_marketing_leaderboard = LightevalTaskConfig( + name="mmlu:marketing", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="marketing", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_marketing_helm = LightevalTaskConfig( + name="mmlu:marketing", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="marketing", + 
hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_medical_genetics_original = LightevalTaskConfig( + name="mmlu:medical_genetics", + suite=["original", "mmlu"], + prompt_function="mmlu_medical_genetics", + hf_repo="cais/mmlu", + hf_subset="medical_genetics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_medical_genetics_leaderboard = LightevalTaskConfig( + name="mmlu:medical_genetics", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="medical_genetics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_medical_genetics_helm = LightevalTaskConfig( + name="mmlu:medical_genetics", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="medical_genetics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_miscellaneous_original = LightevalTaskConfig( + name="mmlu:miscellaneous", + suite=["original", "mmlu"], + prompt_function="mmlu_miscellaneous", + hf_repo="cais/mmlu", + hf_subset="miscellaneous", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_miscellaneous_leaderboard = LightevalTaskConfig( + name="mmlu:miscellaneous", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="miscellaneous", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_miscellaneous_helm = LightevalTaskConfig( + name="mmlu:miscellaneous", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="miscellaneous", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + 
frozen=False, + trust_dataset=True, + version=0, +) +mmlu_moral_disputes_original = LightevalTaskConfig( + name="mmlu:moral_disputes", + suite=["original", "mmlu"], + prompt_function="mmlu_moral_disputes", + hf_repo="cais/mmlu", + hf_subset="moral_disputes", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_moral_disputes_leaderboard = LightevalTaskConfig( + name="mmlu:moral_disputes", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="moral_disputes", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_moral_disputes_helm = LightevalTaskConfig( + name="mmlu:moral_disputes", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="moral_disputes", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_moral_scenarios_original = LightevalTaskConfig( + name="mmlu:moral_scenarios", + suite=["original", "mmlu"], + prompt_function="mmlu_moral_scenarios", + hf_repo="cais/mmlu", + hf_subset="moral_scenarios", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_moral_scenarios_leaderboard = LightevalTaskConfig( + name="mmlu:moral_scenarios", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="moral_scenarios", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_moral_scenarios_helm = LightevalTaskConfig( + name="mmlu:moral_scenarios", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="moral_scenarios", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_nutrition_original = LightevalTaskConfig( + name="mmlu:nutrition", + suite=["original", "mmlu"], + prompt_function="mmlu_nutrition", + hf_repo="cais/mmlu", + hf_subset="nutrition", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + 
evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_nutrition_leaderboard = LightevalTaskConfig( + name="mmlu:nutrition", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="nutrition", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_nutrition_helm = LightevalTaskConfig( + name="mmlu:nutrition", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="nutrition", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_philosophy_original = LightevalTaskConfig( + name="mmlu:philosophy", + suite=["original", "mmlu"], + prompt_function="mmlu_philosophy", + hf_repo="cais/mmlu", + hf_subset="philosophy", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_philosophy_leaderboard = LightevalTaskConfig( + name="mmlu:philosophy", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="philosophy", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_philosophy_helm = LightevalTaskConfig( + name="mmlu:philosophy", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="philosophy", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_prehistory_original = LightevalTaskConfig( + name="mmlu:prehistory", + suite=["original", "mmlu"], + prompt_function="mmlu_prehistory", + hf_repo="cais/mmlu", + hf_subset="prehistory", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_prehistory_leaderboard = LightevalTaskConfig( + name="mmlu:prehistory", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + 
hf_repo="lighteval/mmlu", + hf_subset="prehistory", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_prehistory_helm = LightevalTaskConfig( + name="mmlu:prehistory", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="prehistory", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_professional_accounting_original = LightevalTaskConfig( + name="mmlu:professional_accounting", + suite=["original", "mmlu"], + prompt_function="mmlu_professional_accounting", + hf_repo="cais/mmlu", + hf_subset="professional_accounting", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_professional_accounting_leaderboard = LightevalTaskConfig( + name="mmlu:professional_accounting", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="professional_accounting", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_professional_accounting_helm = LightevalTaskConfig( + name="mmlu:professional_accounting", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="professional_accounting", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_professional_law_original = LightevalTaskConfig( + name="mmlu:professional_law", + suite=["original", "mmlu"], + prompt_function="mmlu_professional_law", + hf_repo="cais/mmlu", + hf_subset="professional_law", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_professional_law_leaderboard = LightevalTaskConfig( + name="mmlu:professional_law", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="professional_law", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + 
metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_professional_law_helm = LightevalTaskConfig( + name="mmlu:professional_law", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="professional_law", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_professional_medicine_original = LightevalTaskConfig( + name="mmlu:professional_medicine", + suite=["original", "mmlu"], + prompt_function="mmlu_professional_medicine", + hf_repo="cais/mmlu", + hf_subset="professional_medicine", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_professional_medicine_leaderboard = LightevalTaskConfig( + name="mmlu:professional_medicine", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="professional_medicine", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_professional_medicine_helm = LightevalTaskConfig( + name="mmlu:professional_medicine", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="professional_medicine", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_professional_psychology_original = LightevalTaskConfig( + name="mmlu:professional_psychology", + suite=["original", "mmlu"], + prompt_function="mmlu_professional_psychology", + hf_repo="cais/mmlu", + hf_subset="professional_psychology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_professional_psychology_leaderboard = LightevalTaskConfig( + name="mmlu:professional_psychology", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="professional_psychology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_professional_psychology_helm = LightevalTaskConfig( 
+ name="mmlu:professional_psychology", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="professional_psychology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_public_relations_original = LightevalTaskConfig( + name="mmlu:public_relations", + suite=["original", "mmlu"], + prompt_function="mmlu_public_relations", + hf_repo="cais/mmlu", + hf_subset="public_relations", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_public_relations_leaderboard = LightevalTaskConfig( + name="mmlu:public_relations", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="public_relations", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_public_relations_helm = LightevalTaskConfig( + name="mmlu:public_relations", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="public_relations", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_security_studies_original = LightevalTaskConfig( + name="mmlu:security_studies", + suite=["original", "mmlu"], + prompt_function="mmlu_security_studies", + hf_repo="cais/mmlu", + hf_subset="security_studies", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_security_studies_leaderboard = LightevalTaskConfig( + name="mmlu:security_studies", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="security_studies", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_security_studies_helm = LightevalTaskConfig( + name="mmlu:security_studies", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="security_studies", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + 
few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_sociology_original = LightevalTaskConfig( + name="mmlu:sociology", + suite=["original", "mmlu"], + prompt_function="mmlu_sociology", + hf_repo="cais/mmlu", + hf_subset="sociology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_sociology_leaderboard = LightevalTaskConfig( + name="mmlu:sociology", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="sociology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_sociology_helm = LightevalTaskConfig( + name="mmlu:sociology", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="sociology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_us_foreign_policy_original = LightevalTaskConfig( + name="mmlu:us_foreign_policy", + suite=["original", "mmlu"], + prompt_function="mmlu_us_foreign_policy", + hf_repo="cais/mmlu", + hf_subset="us_foreign_policy", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_us_foreign_policy_leaderboard = LightevalTaskConfig( + name="mmlu:us_foreign_policy", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="us_foreign_policy", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_us_foreign_policy_helm = LightevalTaskConfig( + name="mmlu:us_foreign_policy", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="us_foreign_policy", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_virology_original = LightevalTaskConfig( + name="mmlu:virology", + 
suite=["original", "mmlu"], + prompt_function="mmlu_virology", + hf_repo="cais/mmlu", + hf_subset="virology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_virology_leaderboard = LightevalTaskConfig( + name="mmlu:virology", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="virology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_virology_helm = LightevalTaskConfig( + name="mmlu:virology", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="virology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_world_religions_original = LightevalTaskConfig( + name="mmlu:world_religions", + suite=["original", "mmlu"], + prompt_function="mmlu_world_religions", + hf_repo="cais/mmlu", + hf_subset="world_religions", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_world_religions_leaderboard = LightevalTaskConfig( + name="mmlu:world_religions", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="world_religions", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_world_religions_helm = LightevalTaskConfig( + name="mmlu:world_religions", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="world_religions", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mnist_ascii_bigbench = LightevalTaskConfig( + name="mnist_ascii", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="mnist_ascii", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + 
trust_dataset=True, + version=0, +) +modified_arithmetic_bigbench = LightevalTaskConfig( + name="modified_arithmetic", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="modified_arithmetic", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +moral_permissibility_bigbench = LightevalTaskConfig( + name="moral_permissibility", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="moral_permissibility", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +movie_dialog_same_or_different_bigbench = LightevalTaskConfig( + name="movie_dialog_same_or_different", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="movie_dialog_same_or_different", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +movie_recommendation_bigbench = LightevalTaskConfig( + name="movie_recommendation", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="movie_recommendation", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mtnt2019_en_fr_lighteval = LightevalTaskConfig( + name="mtnt2019:en-fr", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="mtnt2019_en-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=200, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mtnt2019_en_ja_lighteval = LightevalTaskConfig( + name="mtnt2019:en-ja", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="mtnt2019_en-ja", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=200, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mtnt2019_fr_en_lighteval = LightevalTaskConfig( + name="mtnt2019:fr-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="mtnt2019_fr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=200, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mtnt2019_ja_en_lighteval = 
LightevalTaskConfig( + name="mtnt2019:ja-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="mtnt2019_ja-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=200, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mult_data_wrangling_bigbench = LightevalTaskConfig( + name="mult_data_wrangling", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="mult_data_wrangling", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +multiemo_bigbench = LightevalTaskConfig( + name="multiemo", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="multiemo", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mutual_lighteval = LightevalTaskConfig( + name="mutual", + suite=["lighteval"], + prompt_function="mutual", + hf_repo="lighteval/mutual_harness", + hf_subset="mutual", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["recall_at_1", "recall_at_2", "mrr"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mutual_plus_lighteval = LightevalTaskConfig( + name="mutual_plus", + suite=["lighteval"], + prompt_function="mutual", + hf_repo="lighteval/mutual_harness", + hf_subset="mutual_plus", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["recall_at_1", "recall_at_2", "mrr"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +narrativeqa_helm = LightevalTaskConfig( + name="narrativeqa", + suite=["helm", "helm_general"], + prompt_function="narrativeqa", + hf_repo="lighteval/narrative_qa_helm", + hf_subset="default", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "f1_score", "rougeL", "bleu_1", "bleu_4"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +natural_instructions_bigbench = LightevalTaskConfig( + name="natural_instructions", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="natural_instructions", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleu", "rouge_t5"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +navigate_bigbench = LightevalTaskConfig( + name="navigate", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + 
hf_repo="bigbench", + hf_subset="navigate", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +nonsense_words_grammar_bigbench = LightevalTaskConfig( + name="nonsense_words_grammar", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="nonsense_words_grammar", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +novel_concepts_bigbench_lite = LightevalTaskConfig( + name="novel_concepts", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="novel_concepts", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +numeracy_linear_example_helm = LightevalTaskConfig( + name="numeracy:linear_example", + suite=["helm"], + prompt_function="numeracy", + hf_repo="lighteval/numeracy", + hf_subset="linear_example", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +numeracy_linear_standard_helm = LightevalTaskConfig( + name="numeracy:linear_standard", + suite=["helm"], + prompt_function="numeracy", + hf_repo="lighteval/numeracy", + hf_subset="linear_standard", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +numeracy_parabola_example_helm = LightevalTaskConfig( + name="numeracy:parabola_example", + suite=["helm"], + prompt_function="numeracy", + hf_repo="lighteval/numeracy", + hf_subset="parabola_example", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +numeracy_parabola_standard_helm = LightevalTaskConfig( + name="numeracy:parabola_standard", + suite=["helm"], + prompt_function="numeracy", + hf_repo="lighteval/numeracy", + hf_subset="parabola_standard", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +numeracy_paraboloid_example_helm = LightevalTaskConfig( + name="numeracy:paraboloid_example", + suite=["helm"], + prompt_function="numeracy", + hf_repo="lighteval/numeracy", + hf_subset="paraboloid_example", + hf_avail_splits=["train", "test"], + 
evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +numeracy_paraboloid_standard_helm = LightevalTaskConfig( + name="numeracy:paraboloid_standard", + suite=["helm"], + prompt_function="numeracy", + hf_repo="lighteval/numeracy", + hf_subset="paraboloid_standard", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +numeracy_plane_example_helm = LightevalTaskConfig( + name="numeracy:plane_example", + suite=["helm"], + prompt_function="numeracy", + hf_repo="lighteval/numeracy", + hf_subset="plane_example", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +numeracy_plane_standard_helm = LightevalTaskConfig( + name="numeracy:plane_standard", + suite=["helm"], + prompt_function="numeracy", + hf_repo="lighteval/numeracy", + hf_subset="plane_standard", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +object_counting_bigbench = LightevalTaskConfig( + name="object_counting", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="object_counting", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +odd_one_out_bigbench = LightevalTaskConfig( + name="odd_one_out", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="odd_one_out", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +openbookqa_helm = LightevalTaskConfig( + name="openbookqa", + suite=["helm", "commonsense_scenario", "helm_general"], + prompt_function="openbookqa_helm", + hf_repo="openbookqa", + hf_subset="main", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +openbookqa_lighteval = LightevalTaskConfig( + name="openbookqa", + suite=["lighteval"], + prompt_function="openbookqa", + hf_repo="openbookqa", + hf_subset="main", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + 
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +operators_bigbench_lite = LightevalTaskConfig( + name="operators", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_whitespace_after_query", + hf_repo="bigbench", + hf_subset="operators", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex="([-+]?\\d+)[.]0,1)$", + trust_dataset=True, + version=0, +) +paragraph_segmentation_bigbench = LightevalTaskConfig( + name="paragraph_segmentation", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="paragraph_segmentation", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +parsinlu_qa_bigbench = LightevalTaskConfig( + name="parsinlu_qa", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="parsinlu_qa", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +parsinlu_reading_comprehension_bigbench_lite = LightevalTaskConfig( + name="parsinlu_reading_comprehension", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_linefeed_before_whitespace_after_query", + hf_repo="bigbench", + hf_subset="parsinlu_reading_comprehension", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["perfect_exact_match"], + stop_sequence=None, + output_regex="[^\\.\\?\\!\\;\\n]+", + trust_dataset=True, + version=0, +) +penguins_in_a_table_bigbench = LightevalTaskConfig( + name="penguins_in_a_table", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="penguins_in_a_table", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +periodic_elements_bigbench = LightevalTaskConfig( + name="periodic_elements", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="periodic_elements", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +persian_idioms_bigbench = LightevalTaskConfig( + name="persian_idioms", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="persian_idioms", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + 
few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +phrase_relatedness_bigbench = LightevalTaskConfig( + name="phrase_relatedness", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="phrase_relatedness", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +physical_intuition_bigbench = LightevalTaskConfig( + name="physical_intuition", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="physical_intuition", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +physics_bigbench = LightevalTaskConfig( + name="physics", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="physics", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +physics_questions_bigbench = LightevalTaskConfig( + name="physics_questions", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="physics_questions", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleu", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +piqa_lighteval = LightevalTaskConfig( + name="piqa", + suite=["lighteval"], + prompt_function="piqa_harness", + hf_repo="piqa", + hf_subset="plain_text", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +piqa_helm = LightevalTaskConfig( + name="piqa", + suite=["helm", "commonsense_scenario"], + prompt_function="piqa_helm", + hf_repo="piqa", + hf_subset="plain_text", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +play_dialog_same_or_different_bigbench_lite = LightevalTaskConfig( + name="play_dialog_same_or_different", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_linefeed_before_whitespace_after_query", + hf_repo="bigbench", + hf_subset="play_dialog_same_or_different", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + 
few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +polish_sequence_labeling_bigbench = LightevalTaskConfig( + name="polish_sequence_labeling", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="polish_sequence_labeling", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["f1_score"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +presuppositions_as_nli_bigbench = LightevalTaskConfig( + name="presuppositions_as_nli", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="presuppositions_as_nli", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +prost_lighteval = LightevalTaskConfig( + name="prost", + suite=["lighteval"], + prompt_function="prost", + hf_repo="corypaik/prost", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +pubmedqa_lighteval = LightevalTaskConfig( + name="pubmedqa", + suite=["lighteval"], + prompt_function="pubmed_qa", + hf_repo="pubmed_qa", + hf_subset="pqa_labeled", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +pubmedqa_helm = LightevalTaskConfig( + name="pubmedqa", + suite=["helm"], + prompt_function="pubmed_qa_helm", + hf_repo="pubmed_qa", + hf_subset="pqa_labeled", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +qa4mre_2011_lighteval = LightevalTaskConfig( + name="qa4mre:2011", + suite=["lighteval"], + prompt_function="qa4mre", + hf_repo="qa4mre", + hf_subset="2011.main.EN", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +qa4mre_2012_lighteval = LightevalTaskConfig( + name="qa4mre:2012", + suite=["lighteval"], + prompt_function="qa4mre", + hf_repo="qa4mre", + hf_subset="2012.main.EN", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +qa4mre_2013_lighteval = 
LightevalTaskConfig( + name="qa4mre:2013", + suite=["lighteval"], + prompt_function="qa4mre", + hf_repo="qa4mre", + hf_subset="2013.main.EN", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +qa_wikidata_bigbench = LightevalTaskConfig( + name="qa_wikidata", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="qa_wikidata", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleurt", "bleu", "rouge_t5", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +qasper_lighteval = LightevalTaskConfig( + name="qasper", + suite=["lighteval"], + prompt_function="qasper", + hf_repo="qasper", + hf_subset="qasper", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["f1_score_quasi"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +qasper_ll_lighteval = LightevalTaskConfig( + name="qasper_ll", + suite=["lighteval"], + prompt_function="qasper_ll", + hf_repo="qasper", + hf_subset="qasper", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +quac_helm = LightevalTaskConfig( + name="quac", + suite=["helm"], + prompt_function="quac", + hf_repo="lighteval/quac_helm", + hf_subset="default", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "f1_score"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +question_selection_bigbench = LightevalTaskConfig( + name="question_selection", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="question_selection", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +race_high_lighteval = LightevalTaskConfig( + name="race:high", + suite=["lighteval", "race"], + prompt_function="race", + hf_repo="EleutherAI/race", + hf_subset="high", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +raft_ade_corpus_v2_helm = LightevalTaskConfig( + name="raft:ade_corpus_v2", + suite=["helm", "helm_general"], + prompt_function="raft_ade_corpus_v2", + hf_repo="ought/raft", + hf_subset="ade_corpus_v2", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[ + 
"exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +raft_banking_77_helm = LightevalTaskConfig( + name="raft:banking_77", + suite=["helm", "helm_general"], + prompt_function="raft_banking_77", + hf_repo="ought/raft", + hf_subset="banking_77", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +raft_neurips_impact_statement_risks_helm = LightevalTaskConfig( + name="raft:neurips_impact_statement_risks", + suite=["helm", "helm_general"], + prompt_function="raft_neurips_impact_statement_risks", + hf_repo="ought/raft", + hf_subset="neurips_impact_statement_risks", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +raft_one_stop_english_helm = LightevalTaskConfig( + name="raft:one_stop_english", + suite=["helm", "helm_general"], + prompt_function="raft_one_stop_english", + hf_repo="ought/raft", + hf_subset="one_stop_english", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +raft_overruling_helm = LightevalTaskConfig( + name="raft:overruling", + suite=["helm", "helm_general"], + prompt_function="raft_overruling", + hf_repo="ought/raft", + hf_subset="overruling", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +raft_semiconductor_org_types_helm = LightevalTaskConfig( + name="raft:semiconductor_org_types", + suite=["helm", "helm_general"], + prompt_function="raft_semiconductor_org_types", + hf_repo="ought/raft", + hf_subset="semiconductor_org_types", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +raft_systematic_review_inclusion_helm = LightevalTaskConfig( + name="raft:systematic_review_inclusion", + suite=["helm", "helm_general"], + prompt_function="raft_systematic_review_inclusion", + hf_repo="ought/raft", + hf_subset="systematic_review_inclusion", + 
hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +raft_tai_safety_research_helm = LightevalTaskConfig( + name="raft:tai_safety_research", + suite=["helm", "helm_general"], + prompt_function="raft_tai_safety_research", + hf_repo="ought/raft", + hf_subset="tai_safety_research", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +raft_terms_of_service_helm = LightevalTaskConfig( + name="raft:terms_of_service", + suite=["helm", "helm_general"], + prompt_function="raft_terms_of_service", + hf_repo="ought/raft", + hf_subset="terms_of_service", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +raft_tweet_eval_hate_helm = LightevalTaskConfig( + name="raft:tweet_eval_hate", + suite=["helm", "helm_general"], + prompt_function="raft_tweet_eval_hate", + hf_repo="ought/raft", + hf_subset="tweet_eval_hate", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +raft_twitter_complaints_helm = LightevalTaskConfig( + name="raft:twitter_complaints", + suite=["helm", "helm_general"], + prompt_function="raft_twitter_complaints", + hf_repo="ought/raft", + hf_subset="twitter_complaints", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +real_or_fake_text_bigbench = LightevalTaskConfig( + name="real_or_fake_text", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="real_or_fake_text", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +real_toxicity_prompts_helm = LightevalTaskConfig( + name="real_toxicity_prompts", + suite=["helm"], + prompt_function="real_toxicity_prompts", + hf_repo="allenai/real-toxicity-prompts", + hf_subset="default", + hf_avail_splits=["train"], + 
evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["prediction_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +reasoning_about_colored_objects_bigbench = LightevalTaskConfig( + name="reasoning_about_colored_objects", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="reasoning_about_colored_objects", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +repeat_copy_logic_bigbench_lite = LightevalTaskConfig( + name="repeat_copy_logic", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_whitespace_after_query", + hf_repo="bigbench", + hf_subset="repeat_copy_logic", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +rephrase_bigbench = LightevalTaskConfig( + name="rephrase", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="rephrase", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["rouge_t5", "bleu", "loglikelihood_acc", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +rhyming_bigbench = LightevalTaskConfig( + name="rhyming", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="rhyming", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +riddle_sense_bigbench = LightevalTaskConfig( + name="riddle_sense", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="riddle_sense", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +ruin_names_bigbench = LightevalTaskConfig( + name="ruin_names", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="ruin_names", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +salient_translation_error_detection_bigbench = LightevalTaskConfig( + name="salient_translation_error_detection", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="salient_translation_error_detection", + hf_avail_splits=["default", "train", "validation"], + 
evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +scientific_press_release_bigbench = LightevalTaskConfig( + name="scientific_press_release", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="scientific_press_release", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleu", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +sciq_lighteval = LightevalTaskConfig( + name="sciq", + suite=["lighteval"], + prompt_function="sciq", + hf_repo="sciq", + hf_subset="default", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +semantic_parsing_in_context_sparc_bigbench = LightevalTaskConfig( + name="semantic_parsing_in_context_sparc", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="semantic_parsing_in_context_sparc", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleu", "rouge_t5", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +semantic_parsing_spider_bigbench = LightevalTaskConfig( + name="semantic_parsing_spider", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="semantic_parsing_spider", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleu", "rouge_t5", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +sentence_ambiguity_bigbench = LightevalTaskConfig( + name="sentence_ambiguity", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="sentence_ambiguity", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +similarities_abstraction_bigbench = LightevalTaskConfig( + name="similarities_abstraction", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="similarities_abstraction", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleu", "rouge_t5", "loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +simp_turing_concept_bigbench = LightevalTaskConfig( + name="simp_turing_concept", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="simp_turing_concept", + 
hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +simple_arithmetic_json_bigbench = LightevalTaskConfig( + name="simple_arithmetic_json", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="simple_arithmetic_json", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +simple_arithmetic_json_multiple_choice_bigbench = LightevalTaskConfig( + name="simple_arithmetic_json_multiple_choice", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="simple_arithmetic_json_multiple_choice", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +simple_arithmetic_json_subtasks_bigbench = LightevalTaskConfig( + name="simple_arithmetic_json_subtasks", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="simple_arithmetic_json_subtasks", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +simple_arithmetic_multiple_targets_json_bigbench = LightevalTaskConfig( + name="simple_arithmetic_multiple_targets_json", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="simple_arithmetic_multiple_targets_json", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleu", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +simple_ethical_questions_bigbench = LightevalTaskConfig( + name="simple_ethical_questions", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="simple_ethical_questions", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +simple_text_editing_bigbench = LightevalTaskConfig( + name="simple_text_editing", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="simple_text_editing", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +siqa_helm = LightevalTaskConfig( + name="siqa", + suite=["helm", "commonsense_scenario"], 
+ prompt_function="siqa", + hf_repo="social_i_qa", + hf_subset="default", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +snarks_bigbench = LightevalTaskConfig( + name="snarks", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="snarks", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +social_iqa_bigbench = LightevalTaskConfig( + name="social_iqa", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="social_iqa", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +social_support_bigbench = LightevalTaskConfig( + name="social_support", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="social_support", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["f1_score_macro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +sports_understanding_bigbench = LightevalTaskConfig( + name="sports_understanding", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="sports_understanding", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +storycloze_2016_lighteval = LightevalTaskConfig( + name="storycloze:2016", + suite=["lighteval", "storycloze"], + prompt_function="storycloze", + hf_repo="story_cloze", + hf_subset="2016", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +storycloze_2018_lighteval = LightevalTaskConfig( + name="storycloze:2018", + suite=["lighteval", "storycloze"], + prompt_function="storycloze", + hf_repo="story_cloze", + hf_subset="2018", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +strange_stories_bigbench_lite = LightevalTaskConfig( + name="strange_stories", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_whitespace_after_query", + hf_repo="bigbench", + hf_subset="strange_stories", + hf_avail_splits=["default", "train", "validation"], + 
evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +strategyqa_bigbench_lite = LightevalTaskConfig( + name="strategyqa", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_linefeed_before_whitespace_after_query", + hf_repo="bigbench", + hf_subset="strategyqa", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleu", "rouge_t5", "loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +sufficient_information_bigbench = LightevalTaskConfig( + name="sufficient_information", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="sufficient_information", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +suicide_risk_bigbench = LightevalTaskConfig( + name="suicide_risk", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="suicide_risk", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +summarization_cnn_dm_helm = LightevalTaskConfig( + name="summarization:cnn-dm", + suite=["helm", "helm_general"], + prompt_function="cnn_dm", + hf_repo="lighteval/summarization", + hf_subset="cnn-dm", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=["rouge1", "rouge2", "rougeL", "faithfulness", "extractiveness", "bert_score"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +summarization_xsum_helm = LightevalTaskConfig( + name="summarization:xsum", + suite=["helm", "helm_general"], + prompt_function="xsum", + hf_repo="lighteval/summarization", + hf_subset="xsum", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=64, + metric=["rouge1", "rouge2", "rougeL", "faithfulness", "extractiveness", "bert_score"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +summarization_xsum_sampled_helm = LightevalTaskConfig( + name="summarization:xsum-sampled", + suite=["helm"], + prompt_function="xsum", + hf_repo="lighteval/summarization", + hf_subset="xsum-sampled", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=64, + metric=["rouge1", "rouge2", "rougeL", "faithfulness", "extractiveness", "bert_score"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +super_glue_boolq_lighteval = LightevalTaskConfig( + name="super_glue:boolq", + suite=["lighteval", "superglue"], + 
prompt_function="boolq_harness", + hf_repo="super_glue", + hf_subset="boolq", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +super_glue_cb_lighteval = LightevalTaskConfig( + name="super_glue:cb", + suite=["lighteval", "superglue"], + prompt_function="cb", + hf_repo="super_glue", + hf_subset="cb", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc_single_token", "multi_f1_numeric"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +super_glue_copa_lighteval = LightevalTaskConfig( + name="super_glue:copa", + suite=["lighteval", "superglue"], + prompt_function="copa", + hf_repo="super_glue", + hf_subset="copa", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +super_glue_rte_lighteval = LightevalTaskConfig( + name="super_glue:rte", + suite=["lighteval", "superglue"], + prompt_function="rte", + hf_repo="super_glue", + hf_subset="rte", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +super_glue_multirc_lighteval = LightevalTaskConfig( + name="super_glue:multirc", + suite=["lighteval", "superglue"], + prompt_function="multirc", + hf_repo="super_glue", + hf_subset="multirc", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +super_glue_wic_lighteval = LightevalTaskConfig( + name="super_glue:wic", + suite=["lighteval", "superglue"], + prompt_function="wic", + hf_repo="super_glue", + hf_subset="wic", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +super_glue_wsc_lighteval = LightevalTaskConfig( + name="super_glue:wsc", + suite=["lighteval", "superglue"], + prompt_function="wsc", + hf_repo="super_glue", + hf_subset="wsc", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +swahili_english_proverbs_bigbench = LightevalTaskConfig( + name="swahili_english_proverbs", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="swahili_english_proverbs", + hf_avail_splits=["default", "train", "validation"], + 
evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +swag_lighteval = LightevalTaskConfig( + name="swag", + suite=["lighteval"], + prompt_function="swag", + hf_repo="swag", + hf_subset="regular", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +swedish_to_german_proverbs_bigbench = LightevalTaskConfig( + name="swedish_to_german_proverbs", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="swedish_to_german_proverbs", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +symbol_interpretation_bigbench_lite = LightevalTaskConfig( + name="symbol_interpretation", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_linefeed_before_whitespace_after_query", + hf_repo="bigbench", + hf_subset="symbol_interpretation", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +synthetic_reasoning_induction_helm = LightevalTaskConfig( + name="synthetic_reasoning:induction", + suite=["helm"], + prompt_function="synthetic_reasoning", + hf_repo="lighteval/synthetic_reasoning", + hf_subset="induction", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=50, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +synthetic_reasoning_natural_easy_helm = LightevalTaskConfig( + name="synthetic_reasoning:natural_easy", + suite=["helm"], + prompt_function="synthetic_reasoning_natural", + hf_repo="lighteval/synthetic_reasoning_natural", + hf_subset="easy", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "f1_score"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +synthetic_reasoning_natural_hard_helm = LightevalTaskConfig( + name="synthetic_reasoning:natural_hard", + suite=["helm"], + prompt_function="synthetic_reasoning_natural", + hf_repo="lighteval/synthetic_reasoning_natural", + hf_subset="hard", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "f1_score"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +synthetic_reasoning_pattern_match_helm = LightevalTaskConfig( + 
name="synthetic_reasoning:pattern_match", + suite=["helm"], + prompt_function="synthetic_reasoning", + hf_repo="lighteval/synthetic_reasoning", + hf_subset="pattern_match", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=50, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +synthetic_reasoning_variable_substitution_helm = LightevalTaskConfig( + name="synthetic_reasoning:variable_substitution", + suite=["helm"], + prompt_function="synthetic_reasoning", + hf_repo="lighteval/synthetic_reasoning", + hf_subset="variable_substitution", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=50, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +tellmewhy_bigbench = LightevalTaskConfig( + name="tellmewhy", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="tellmewhy", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleu", "rouge_t5"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +temporal_sequences_bigbench = LightevalTaskConfig( + name="temporal_sequences", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="temporal_sequences", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +tense_bigbench = LightevalTaskConfig( + name="tense", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="tense", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_arxiv_lighteval = LightevalTaskConfig( + name="the_pile:arxiv", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_arxiv", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_arxiv_helm = LightevalTaskConfig( + name="the_pile:arxiv", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="arxiv", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) 
+the_pile_bibliotik_helm = LightevalTaskConfig( + name="the_pile:bibliotik", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="bibliotik", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_bookcorpus2_lighteval = LightevalTaskConfig( + name="the_pile:bookcorpus2", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_bookcorpus2", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_books3_lighteval = LightevalTaskConfig( + name="the_pile:books3", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_books3", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_commoncrawl_helm = LightevalTaskConfig( + name="the_pile:commoncrawl", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="commoncrawl", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_dm_mathematics_lighteval = LightevalTaskConfig( + name="the_pile:dm-mathematics", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_dm-mathematics", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_dm_mathematics_helm = LightevalTaskConfig( + name="the_pile:dm-mathematics", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="dm-mathematics", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_enron_lighteval = LightevalTaskConfig( + name="the_pile:enron", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_enron", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_enron_helm = LightevalTaskConfig( + 
name="the_pile:enron", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="enron", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_europarl_lighteval = LightevalTaskConfig( + name="the_pile:europarl", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_europarl", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_europarl_helm = LightevalTaskConfig( + name="the_pile:europarl", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="europarl", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_freelaw_lighteval = LightevalTaskConfig( + name="the_pile:freelaw", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_freelaw", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_freelaw_helm = LightevalTaskConfig( + name="the_pile:freelaw", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="freelaw", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_github_lighteval = LightevalTaskConfig( + name="the_pile:github", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_github", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_github_helm = LightevalTaskConfig( + name="the_pile:github", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="github", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_gutenberg_lighteval = LightevalTaskConfig( + name="the_pile:gutenberg", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_gutenberg", + 
hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_gutenberg_helm = LightevalTaskConfig( + name="the_pile:gutenberg", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="gutenberg", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_hackernews_lighteval = LightevalTaskConfig( + name="the_pile:hackernews", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_hackernews", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_hackernews_helm = LightevalTaskConfig( + name="the_pile:hackernews", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="hackernews", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_nih_exporter_lighteval = LightevalTaskConfig( + name="the_pile:nih-exporter", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_nih-exporter", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_nih_exporter_helm = LightevalTaskConfig( + name="the_pile:nih-exporter", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="nih-exporter", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_opensubtitles_lighteval = LightevalTaskConfig( + name="the_pile:opensubtitles", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_opensubtitles", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_opensubtitles_helm = LightevalTaskConfig( + name="the_pile:opensubtitles", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="opensubtitles", + hf_avail_splits=["test"], + evaluation_splits=["test"], + 
few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_openwebtext2_lighteval = LightevalTaskConfig( + name="the_pile:openwebtext2", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_openwebtext2", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_openwebtext2_helm = LightevalTaskConfig( + name="the_pile:openwebtext2", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="openwebtext2", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_philpapers_lighteval = LightevalTaskConfig( + name="the_pile:philpapers", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_philpapers", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_pile_cc_lighteval = LightevalTaskConfig( + name="the_pile:pile-cc", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_pile-cc", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_pubmed_abstracts_lighteval = LightevalTaskConfig( + name="the_pile:pubmed-abstracts", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_pubmed-abstracts", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_pubmed_abstracts_helm = LightevalTaskConfig( + name="the_pile:pubmed-abstracts", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="pubmed-abstracts", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_pubmed_central_lighteval = LightevalTaskConfig( + name="the_pile:pubmed-central", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_pubmed-central", + hf_avail_splits=["validation", "test"], + 
evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_pubmed_central_helm = LightevalTaskConfig( + name="the_pile:pubmed-central", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="pubmed-central", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_stackexchange_lighteval = LightevalTaskConfig( + name="the_pile:stackexchange", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_stackexchange", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_stackexchange_helm = LightevalTaskConfig( + name="the_pile:stackexchange", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="stackexchange", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_ubuntu_irc_lighteval = LightevalTaskConfig( + name="the_pile:ubuntu-irc", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_ubuntu-irc", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_uspto_lighteval = LightevalTaskConfig( + name="the_pile:uspto", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_upsto", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_upsto_helm = LightevalTaskConfig( + name="the_pile:upsto", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="uspto", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_wikipedia_lighteval = LightevalTaskConfig( + name="the_pile:wikipedia", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_wikipedia", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + 
few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_wikipedia_helm = LightevalTaskConfig( + name="the_pile:wikipedia", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="wikipedia", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_youtubesubtitles_lighteval = LightevalTaskConfig( + name="the_pile:youtubesubtitles", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_youtubesubtitles", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_youtubesubtitles_helm = LightevalTaskConfig( + name="the_pile:youtubesubtitles", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="youtubesubtitles", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +timedial_bigbench = LightevalTaskConfig( + name="timedial", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="timedial", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +toxigen_lighteval = LightevalTaskConfig( + name="toxigen", + suite=["lighteval"], + prompt_function="toxigen", + hf_repo="skg/toxigen-data", + hf_subset="annotated", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +topical_chat_bigbench = LightevalTaskConfig( + name="topical_chat", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="topical_chat", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleu", "rouge_t5", "loglikelihood_acc", "bleurt"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +tracking_shuffled_objects_bigbench = LightevalTaskConfig( + name="tracking_shuffled_objects", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="tracking_shuffled_objects", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + 
metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +triviaqa_lighteval = LightevalTaskConfig( + name="triviaqa", + suite=["lighteval"], + prompt_function="triviaqa", + hf_repo="trivia_qa", + hf_subset="rc.nocontext", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["quasi_exact_match_triviaqa"], + stop_sequence=["\n", ".", ","], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +truthfulqa_gen_lighteval = LightevalTaskConfig( + name="truthfulqa:gen", + suite=["lighteval"], + prompt_function="truthful_qa_generative", + hf_repo="truthful_qa", + hf_subset="generation", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=200, + metric=["bleu", "rouge_t5"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +truthfulqa_mc_leaderboard = LightevalTaskConfig( + name="truthfulqa:mc", + suite=["leaderboard"], + prompt_function="truthful_qa_multiple_choice", + hf_repo="truthful_qa", + hf_subset="multiple_choice", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["truthfulqa_mc_metrics"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +truthfulqa_helm = LightevalTaskConfig( + name="truthfulqa", + suite=["helm", "helm_general"], + prompt_function="truthful_qa_helm", + hf_repo="lighteval/truthfulqa_helm", + hf_subset="default", + hf_avail_splits=["train", "valid"], + evaluation_splits=["valid"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["loglikelihood_acc", "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +twitterAAE_aa_helm = LightevalTaskConfig( + name="twitterAAE:aa", + suite=["helm"], + prompt_function="twitter_aae", + hf_repo="lighteval/twitterAAE", + hf_subset="aa", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +twitterAAE_white_helm = LightevalTaskConfig( + name="twitterAAE:white", + suite=["helm"], + prompt_function="twitter_aae", + hf_repo="lighteval/twitterAAE", + hf_subset="white", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +understanding_fables_bigbench = LightevalTaskConfig( + name="understanding_fables", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="understanding_fables", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) 
+undo_permutation_bigbench = LightevalTaskConfig( + name="undo_permutation", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="undo_permutation", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +unit_conversion_bigbench = LightevalTaskConfig( + name="unit_conversion", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="unit_conversion", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +unit_interpretation_bigbench = LightevalTaskConfig( + name="unit_interpretation", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="unit_interpretation", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +unnatural_in_context_learning_bigbench = LightevalTaskConfig( + name="unnatural_in_context_learning", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="unnatural_in_context_learning", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +unscramble_anagrams1_lighteval = LightevalTaskConfig( + name="unscramble:anagrams1", + suite=["lighteval", "unscramble"], + prompt_function="unscramble", + hf_repo="lighteval/GPT3_unscramble", + hf_subset="default", + hf_avail_splits=["mid_word_1_anagrams"], + evaluation_splits=["mid_word_1_anagrams"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +unscramble_anagrams2_lighteval = LightevalTaskConfig( + name="unscramble:anagrams2", + suite=["lighteval", "unscramble"], + prompt_function="unscramble", + hf_repo="lighteval/GPT3_unscramble", + hf_subset="default", + hf_avail_splits=["mid_word_2_anagrams"], + evaluation_splits=["mid_word_2_anagrams"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +unscramble_cycle_letters_lighteval = LightevalTaskConfig( + name="unscramble:cycle_letters", + suite=["lighteval", "unscramble"], + prompt_function="unscramble", + hf_repo="lighteval/GPT3_unscramble", + hf_subset="default", + hf_avail_splits=["cycle_letters_in_word"], + evaluation_splits=["cycle_letters_in_word"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) 
+unscramble_random_insertion_lighteval = LightevalTaskConfig( + name="unscramble:random_insertion", + suite=["lighteval", "unscramble"], + prompt_function="unscramble", + hf_repo="lighteval/GPT3_unscramble", + hf_subset="default", + hf_avail_splits=["random_insertion_in_word"], + evaluation_splits=["random_insertion_in_word"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +unscramble_reversed_words_lighteval = LightevalTaskConfig( + name="unscramble:reversed_words", + suite=["lighteval", "unscramble"], + prompt_function="unscramble", + hf_repo="lighteval/GPT3_unscramble", + hf_subset="default", + hf_avail_splits=["reversed_words"], + evaluation_splits=["reversed_words"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +vitaminc_fact_verification_bigbench_lite = LightevalTaskConfig( + name="vitaminc_fact_verification", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_whitespace_after_query", + hf_repo="bigbench", + hf_subset="vitaminc_fact_verification", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +webqs_lighteval = LightevalTaskConfig( + name="webqs", + suite=["lighteval"], + prompt_function="webqs", + hf_repo="web_questions", + hf_subset="default", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["acc_golds_likelihood"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +what_is_the_tao_bigbench = LightevalTaskConfig( + name="what_is_the_tao", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="what_is_the_tao", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +which_wiki_edit_bigbench = LightevalTaskConfig( + name="which_wiki_edit", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="which_wiki_edit", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_applies_to_jurisdiction_helm = LightevalTaskConfig( + name="wikifact:applies_to_jurisdiction", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="applies_to_jurisdiction", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, 
+) +wikifact_atomic_number_helm = LightevalTaskConfig( + name="wikifact:atomic_number", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="atomic_number", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_author_helm = LightevalTaskConfig( + name="wikifact:author", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="author", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_award_received_helm = LightevalTaskConfig( + name="wikifact:award_received", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="award_received", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_basic_form_of_government_helm = LightevalTaskConfig( + name="wikifact:basic_form_of_government", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="basic_form_of_government", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_capital_helm = LightevalTaskConfig( + name="wikifact:capital", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="capital", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_capital_of_helm = LightevalTaskConfig( + name="wikifact:capital_of", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="capital_of", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_central_bank_helm = LightevalTaskConfig( + name="wikifact:central_bank", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="central_bank", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", 
"prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_composer_helm = LightevalTaskConfig( + name="wikifact:composer", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="composer", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_continent_helm = LightevalTaskConfig( + name="wikifact:continent", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="continent", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_country_helm = LightevalTaskConfig( + name="wikifact:country", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="country", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_country_of_citizenship_helm = LightevalTaskConfig( + name="wikifact:country_of_citizenship", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="country_of_citizenship", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_country_of_origin_helm = LightevalTaskConfig( + name="wikifact:country_of_origin", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="country_of_origin", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_creator_helm = LightevalTaskConfig( + name="wikifact:creator", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="creator", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_currency_helm = LightevalTaskConfig( + name="wikifact:currency", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="currency", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + 
few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_defendant_helm = LightevalTaskConfig( + name="wikifact:defendant", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="defendant", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_developer_helm = LightevalTaskConfig( + name="wikifact:developer", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="developer", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_diplomatic_relation_helm = LightevalTaskConfig( + name="wikifact:diplomatic_relation", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="diplomatic_relation", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_director_helm = LightevalTaskConfig( + name="wikifact:director", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="director", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_discoverer_or_inventor_helm = LightevalTaskConfig( + name="wikifact:discoverer_or_inventor", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="discoverer_or_inventor", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_drug_or_therapy_used_for_treatment_helm = LightevalTaskConfig( + name="wikifact:drug_or_therapy_used_for_treatment", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="drug_or_therapy_used_for_treatment", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_educated_at_helm = LightevalTaskConfig( + name="wikifact:educated_at", + 
suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="educated_at", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_electron_configuration_helm = LightevalTaskConfig( + name="wikifact:electron_configuration", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="electron_configuration", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_employer_helm = LightevalTaskConfig( + name="wikifact:employer", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="employer", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_field_of_work_helm = LightevalTaskConfig( + name="wikifact:field_of_work", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="field_of_work", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_file_extension_helm = LightevalTaskConfig( + name="wikifact:file_extension", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="file_extension", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_genetic_association_helm = LightevalTaskConfig( + name="wikifact:genetic_association", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="genetic_association", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_genre_helm = LightevalTaskConfig( + name="wikifact:genre", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="genre", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, 
+ frozen=False, + trust_dataset=True, + version=0, +) +wikifact_has_part_helm = LightevalTaskConfig( + name="wikifact:has_part", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="has_part", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_head_of_government_helm = LightevalTaskConfig( + name="wikifact:head_of_government", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="head_of_government", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_head_of_state_helm = LightevalTaskConfig( + name="wikifact:head_of_state", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="head_of_state", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_headquarters_location_helm = LightevalTaskConfig( + name="wikifact:headquarters_location", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="headquarters_location", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_industry_helm = LightevalTaskConfig( + name="wikifact:industry", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="industry", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_influenced_by_helm = LightevalTaskConfig( + name="wikifact:influenced_by", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="influenced_by", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_instance_of_helm = LightevalTaskConfig( + name="wikifact:instance_of", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="instance_of", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + 
metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_instrument_helm = LightevalTaskConfig( + name="wikifact:instrument", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="instrument", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_language_of_work_or_name_helm = LightevalTaskConfig( + name="wikifact:language_of_work_or_name", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="language_of_work_or_name", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_languages_spoken_written_or_signed_helm = LightevalTaskConfig( + name="wikifact:languages_spoken_written_or_signed", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="languages_spoken_written_or_signed", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_laws_applied_helm = LightevalTaskConfig( + name="wikifact:laws_applied", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="laws_applied", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_located_in_the_administrative_territorial_entity_helm = LightevalTaskConfig( + name="wikifact:located_in_the_administrative_territorial_entity", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="located_in_the_administrative_territorial_entity", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_location_helm = LightevalTaskConfig( + name="wikifact:location", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="location", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) 
+wikifact_location_of_discovery_helm = LightevalTaskConfig( + name="wikifact:location_of_discovery", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="location_of_discovery", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_location_of_formation_helm = LightevalTaskConfig( + name="wikifact:location_of_formation", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="location_of_formation", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_majority_opinion_by_helm = LightevalTaskConfig( + name="wikifact:majority_opinion_by", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="majority_opinion_by", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_manufacturer_helm = LightevalTaskConfig( + name="wikifact:manufacturer", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="manufacturer", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_measured_physical_quantity_helm = LightevalTaskConfig( + name="wikifact:measured_physical_quantity", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="measured_physical_quantity", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_medical_condition_treated_helm = LightevalTaskConfig( + name="wikifact:medical_condition_treated", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="medical_condition_treated", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_member_of_helm = LightevalTaskConfig( + name="wikifact:member_of", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="member_of", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + 
few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_member_of_political_party_helm = LightevalTaskConfig( + name="wikifact:member_of_political_party", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="member_of_political_party", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_member_of_sports_team_helm = LightevalTaskConfig( + name="wikifact:member_of_sports_team", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="member_of_sports_team", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_movement_helm = LightevalTaskConfig( + name="wikifact:movement", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="movement", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_named_after_helm = LightevalTaskConfig( + name="wikifact:named_after", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="named_after", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_native_language_helm = LightevalTaskConfig( + name="wikifact:native_language", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="native_language", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_number_of_processor_cores_helm = LightevalTaskConfig( + name="wikifact:number_of_processor_cores", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="number_of_processor_cores", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_occupation_helm = 
LightevalTaskConfig( + name="wikifact:occupation", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="occupation", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_office_held_by_head_of_government_helm = LightevalTaskConfig( + name="wikifact:office_held_by_head_of_government", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="office_held_by_head_of_government", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_office_held_by_head_of_state_helm = LightevalTaskConfig( + name="wikifact:office_held_by_head_of_state", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="office_held_by_head_of_state", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_official_language_helm = LightevalTaskConfig( + name="wikifact:official_language", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="official_language", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_operating_system_helm = LightevalTaskConfig( + name="wikifact:operating_system", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="operating_system", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_original_language_of_film_or_TV_show_helm = LightevalTaskConfig( + name="wikifact:original_language_of_film_or_TV_show", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="original_language_of_film_or_TV_show", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_original_network_helm = LightevalTaskConfig( + name="wikifact:original_network", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="original_network", + hf_avail_splits=["train", 
"test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_overrules_helm = LightevalTaskConfig( + name="wikifact:overrules", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="overrules", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_owned_by_helm = LightevalTaskConfig( + name="wikifact:owned_by", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="owned_by", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_part_of_helm = LightevalTaskConfig( + name="wikifact:part_of", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="part_of", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_participating_team_helm = LightevalTaskConfig( + name="wikifact:participating_team", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="participating_team", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_place_of_birth_helm = LightevalTaskConfig( + name="wikifact:place_of_birth", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="place_of_birth", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_place_of_death_helm = LightevalTaskConfig( + name="wikifact:place_of_death", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="place_of_death", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_plaintiff_helm = LightevalTaskConfig( + name="wikifact:plaintiff", + suite=["helm"], + 
prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="plaintiff", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_position_held_helm = LightevalTaskConfig( + name="wikifact:position_held", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="position_held", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_position_played_on_team_helm = LightevalTaskConfig( + name="wikifact:position_played_on_team", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="position_played_on_team", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_programming_language_helm = LightevalTaskConfig( + name="wikifact:programming_language", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="programming_language", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_recommended_unit_of_measurement_helm = LightevalTaskConfig( + name="wikifact:recommended_unit_of_measurement", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="recommended_unit_of_measurement", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_record_label_helm = LightevalTaskConfig( + name="wikifact:record_label", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="record_label", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_religion_helm = LightevalTaskConfig( + name="wikifact:religion", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="religion", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", 
"prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_repealed_by_helm = LightevalTaskConfig( + name="wikifact:repealed_by", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="repealed_by", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_shares_border_with_helm = LightevalTaskConfig( + name="wikifact:shares_border_with", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="shares_border_with", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_solved_by_helm = LightevalTaskConfig( + name="wikifact:solved_by", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="solved_by", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_statement_describes_helm = LightevalTaskConfig( + name="wikifact:statement_describes", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="statement_describes", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_stock_exchange_helm = LightevalTaskConfig( + name="wikifact:stock_exchange", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="stock_exchange", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_subclass_of_helm = LightevalTaskConfig( + name="wikifact:subclass_of", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="subclass_of", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_subsidiary_helm = LightevalTaskConfig( + name="wikifact:subsidiary", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="subsidiary", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], 
+ few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_symptoms_and_signs_helm = LightevalTaskConfig( + name="wikifact:symptoms_and_signs", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="symptoms_and_signs", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_therapeutic_area_helm = LightevalTaskConfig( + name="wikifact:therapeutic_area", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="therapeutic_area", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_time_of_discovery_or_invention_helm = LightevalTaskConfig( + name="wikifact:time_of_discovery_or_invention", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="time_of_discovery_or_invention", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_twinned_administrative_body_helm = LightevalTaskConfig( + name="wikifact:twinned_administrative_body", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="twinned_administrative_body", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_work_location_helm = LightevalTaskConfig( + name="wikifact:work_location", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="work_location", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikitext_2_lighteval = LightevalTaskConfig( + name="wikitext:2", + suite=["lighteval"], + prompt_function="wikitext", + hf_repo="wikitext", + hf_subset="wikitext-2-raw-v1", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikitext_103_document_level_harness = 
LightevalTaskConfig( + name="wikitext:103:document_level", + suite=["harness"], + prompt_function="wikitext_harness", + hf_repo="EleutherAI/wikitext_document_level", + hf_subset="wikitext-103-raw-v1", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikitext_103_document_level_helm = LightevalTaskConfig( + name="wikitext:103:document_level", + suite=["helm"], + prompt_function="wikitext_helm", + hf_repo="EleutherAI/wikitext_document_level", + hf_subset="wikitext-103-raw-v1", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wino_x_german_bigbench = LightevalTaskConfig( + name="wino_x_german", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="wino_x_german", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +winogrande_leaderboard = LightevalTaskConfig( + name="winogrande", + suite=["leaderboard"], + prompt_function="winogrande", + hf_repo="winogrande", + hf_subset="winogrande_xl", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +winowhy_bigbench_lite = LightevalTaskConfig( + name="winowhy", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_whitespace_after_query", + hf_repo="bigbench", + hf_subset="winowhy", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt08_cs_en_lighteval = LightevalTaskConfig( + name="wmt08:cs-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt08_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt08_de_en_lighteval = LightevalTaskConfig( + name="wmt08:de-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt08_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt08_en_cs_lighteval = LightevalTaskConfig( + name="wmt08:en-cs", + suite=["lighteval", "sacrebleu"], + 
prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt08_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt08_en_de_lighteval = LightevalTaskConfig( + name="wmt08:en-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt08_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt08_en_es_lighteval = LightevalTaskConfig( + name="wmt08:en-es", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt08_en-es", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt08_en_fr_lighteval = LightevalTaskConfig( + name="wmt08:en-fr", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt08_en-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt08_en_hu_lighteval = LightevalTaskConfig( + name="wmt08:en-hu", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt08_en-hu", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt08_es_en_lighteval = LightevalTaskConfig( + name="wmt08:es-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt08_es-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt08_fr_en_lighteval = LightevalTaskConfig( + name="wmt08:fr-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt08_fr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt08_hu_en_lighteval = LightevalTaskConfig( + name="wmt08:hu-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt08_hu-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + 
few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt09_cs_en_lighteval = LightevalTaskConfig( + name="wmt09:cs-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt09_de_en_lighteval = LightevalTaskConfig( + name="wmt09:de-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt09_en_cs_lighteval = LightevalTaskConfig( + name="wmt09:en-cs", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt09_en_de_lighteval = LightevalTaskConfig( + name="wmt09:en-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt09_en_es_lighteval = LightevalTaskConfig( + name="wmt09:en-es", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_en-es", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt09_en_fr_lighteval = LightevalTaskConfig( + name="wmt09:en-fr", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_en-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt09_en_hu_lighteval = LightevalTaskConfig( + name="wmt09:en-hu", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_en-hu", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt09_en_it_lighteval = LightevalTaskConfig( + 
name="wmt09:en-it", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_en-it", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt09_es_en_lighteval = LightevalTaskConfig( + name="wmt09:es-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_es-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt09_fr_en_lighteval = LightevalTaskConfig( + name="wmt09:fr-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_fr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt09_hu_en_lighteval = LightevalTaskConfig( + name="wmt09:hu-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_hu-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt09_it_en_lighteval = LightevalTaskConfig( + name="wmt09:it-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_it-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt10_cs_en_lighteval = LightevalTaskConfig( + name="wmt10:cs-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt10_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt10_de_en_lighteval = LightevalTaskConfig( + name="wmt10:de-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt10_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt10_en_cs_lighteval = LightevalTaskConfig( + name="wmt10:en-cs", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt10_en-cs", + hf_avail_splits=["test"], + 
evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt10_en_de_lighteval = LightevalTaskConfig( + name="wmt10:en-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt10_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt10_en_es_lighteval = LightevalTaskConfig( + name="wmt10:en-es", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt10_en-es", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt10_en_fr_lighteval = LightevalTaskConfig( + name="wmt10:en-fr", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt10_en-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt10_es_en_lighteval = LightevalTaskConfig( + name="wmt10:es-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt10_es-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt10_fr_en_lighteval = LightevalTaskConfig( + name="wmt10:fr-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt10_fr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt11_cs_en_lighteval = LightevalTaskConfig( + name="wmt11:cs-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt11_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt11_de_en_lighteval = LightevalTaskConfig( + name="wmt11:de-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt11_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + 
version=0, +) +wmt11_en_cs_lighteval = LightevalTaskConfig( + name="wmt11:en-cs", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt11_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt11_en_de_lighteval = LightevalTaskConfig( + name="wmt11:en-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt11_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt11_en_es_lighteval = LightevalTaskConfig( + name="wmt11:en-es", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt11_en-es", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt11_en_fr_lighteval = LightevalTaskConfig( + name="wmt11:en-fr", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt11_en-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt11_es_en_lighteval = LightevalTaskConfig( + name="wmt11:es-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt11_es-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt11_fr_en_lighteval = LightevalTaskConfig( + name="wmt11:fr-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt11_fr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt12_cs_en_lighteval = LightevalTaskConfig( + name="wmt12:cs-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt12_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt12_de_en_lighteval = LightevalTaskConfig( + name="wmt12:de-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + 
hf_subset="wmt12_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt12_en_cs_lighteval = LightevalTaskConfig( + name="wmt12:en-cs", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt12_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt12_en_de_lighteval = LightevalTaskConfig( + name="wmt12:en-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt12_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt12_en_es_lighteval = LightevalTaskConfig( + name="wmt12:en-es", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt12_en-es", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt12_en_fr_lighteval = LightevalTaskConfig( + name="wmt12:en-fr", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt12_en-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt12_es_en_lighteval = LightevalTaskConfig( + name="wmt12:es-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt12_es-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt12_fr_en_lighteval = LightevalTaskConfig( + name="wmt12:fr-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt12_fr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt13_cs_en_lighteval = LightevalTaskConfig( + name="wmt13:cs-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt13_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + 
stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt13_de_en_lighteval = LightevalTaskConfig( + name="wmt13:de-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt13_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt13_en_cs_lighteval = LightevalTaskConfig( + name="wmt13:en-cs", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt13_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt13_en_de_lighteval = LightevalTaskConfig( + name="wmt13:en-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt13_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt13_en_es_lighteval = LightevalTaskConfig( + name="wmt13:en-es", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt13_en-es", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt13_en_fr_lighteval = LightevalTaskConfig( + name="wmt13:en-fr", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt13_en-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt13_en_ru_lighteval = LightevalTaskConfig( + name="wmt13:en-ru", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt13_en-ru", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt13_es_en_lighteval = LightevalTaskConfig( + name="wmt13:es-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt13_es-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt13_fr_en_lighteval = LightevalTaskConfig( + name="wmt13:fr-en", + suite=["lighteval", "sacrebleu"], + 
prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt13_fr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt13_ru_en_lighteval = LightevalTaskConfig( + name="wmt13:ru-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt13_ru-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_cs_en_lighteval = LightevalTaskConfig( + name="wmt14:cs-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt14_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_de_en_lighteval = LightevalTaskConfig( + name="wmt14:de-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt14_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_en_cs_lighteval = LightevalTaskConfig( + name="wmt14:en-cs", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt14_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_en_de_lighteval = LightevalTaskConfig( + name="wmt14:en-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt14_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_en_fr_lighteval = LightevalTaskConfig( + name="wmt14:en-fr", + suite=["lighteval", "gpt3_benchmarks"], + prompt_function="wmt_alphabetical", + hf_repo="wmt14", + hf_subset="fr-en", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_en_fr_lighteval = LightevalTaskConfig( + name="wmt14:en-fr", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt14_en-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, 
+ generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_en_hi_lighteval = LightevalTaskConfig( + name="wmt14:en-hi", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt14_en-hi", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_en_ru_lighteval = LightevalTaskConfig( + name="wmt14:en-ru", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt14_en-ru", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_fr_en_lighteval = LightevalTaskConfig( + name="wmt14:fr-en", + suite=["lighteval", "gpt3_benchmarks"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="wmt14", + hf_subset="fr-en", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_fr_en_lighteval = LightevalTaskConfig( + name="wmt14:fr-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt14_fr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_hi_en_lighteval = LightevalTaskConfig( + name="wmt14:hi-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt14_hi-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_ru_en_lighteval = LightevalTaskConfig( + name="wmt14:ru-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt14_ru-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_cs_en_helm = LightevalTaskConfig( + name="wmt14:cs-en", + suite=["helm"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/wmt14", + hf_subset="cs-en", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["bleu"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_de_en_helm = LightevalTaskConfig( + name="wmt14:de-en", + 
suite=["helm"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/wmt14", + hf_subset="de-en", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["bleu"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_fr_en_helm = LightevalTaskConfig( + name="wmt14:fr-en", + suite=["helm"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/wmt14", + hf_subset="fr-en", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["bleu"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_hi_en_helm = LightevalTaskConfig( + name="wmt14:hi-en", + suite=["helm"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/wmt14", + hf_subset="hi-en", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["bleu"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_ru_en_helm = LightevalTaskConfig( + name="wmt14:ru-en", + suite=["helm"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/wmt14", + hf_subset="ru-en", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["bleu"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt15_cs_en_lighteval = LightevalTaskConfig( + name="wmt15:cs-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt15_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt15_de_en_lighteval = LightevalTaskConfig( + name="wmt15:de-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt15_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt15_en_cs_lighteval = LightevalTaskConfig( + name="wmt15:en-cs", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt15_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt15_en_de_lighteval = LightevalTaskConfig( + name="wmt15:en-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt15_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + 
metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt15_en_fi_lighteval = LightevalTaskConfig( + name="wmt15:en-fi", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt15_en-fi", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt15_en_fr_lighteval = LightevalTaskConfig( + name="wmt15:en-fr", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt15_en-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt15_en_ru_lighteval = LightevalTaskConfig( + name="wmt15:en-ru", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt15_en-ru", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt15_fi_en_lighteval = LightevalTaskConfig( + name="wmt15:fi-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt15_fi-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt15_fr_en_lighteval = LightevalTaskConfig( + name="wmt15:fr-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt15_fr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt15_ru_en_lighteval = LightevalTaskConfig( + name="wmt15:ru-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt15_ru-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_cs_en_lighteval = LightevalTaskConfig( + name="wmt16:cs-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_de_en_lighteval = LightevalTaskConfig( + name="wmt16:de-en", + 
suite=["lighteval", "gpt3_benchmarks"], + prompt_function="wmt_alphabetical", + hf_repo="wmt16", + hf_subset="de-en", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_de_en_lighteval = LightevalTaskConfig( + name="wmt16:de-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_en_cs_lighteval = LightevalTaskConfig( + name="wmt16:en-cs", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_en_de_lighteval = LightevalTaskConfig( + name="wmt16:en-de", + suite=["lighteval", "gpt3_benchmarks"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="wmt16", + hf_subset="de-en", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_en_de_lighteval = LightevalTaskConfig( + name="wmt16:en-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_en_fi_lighteval = LightevalTaskConfig( + name="wmt16:en-fi", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_en-fi", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_en_ro_lighteval = LightevalTaskConfig( + name="wmt16:en-ro", + suite=["lighteval", "gpt3_benchmarks"], + prompt_function="wmt_alphabetical", + hf_repo="wmt16", + hf_subset="ro-en", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_en_ro_lighteval = LightevalTaskConfig( + name="wmt16:en-ro", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_en-ro", + hf_avail_splits=["test"], + evaluation_splits=["test"], + 
few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_en_ru_lighteval = LightevalTaskConfig( + name="wmt16:en-ru", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_en-ru", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_en_tr_lighteval = LightevalTaskConfig( + name="wmt16:en-tr", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_en-tr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_fi_en_lighteval = LightevalTaskConfig( + name="wmt16:fi-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_fi-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_ro_en_lighteval = LightevalTaskConfig( + name="wmt16:ro-en", + suite=["lighteval", "gpt3_benchmarks"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="wmt16", + hf_subset="ro-en", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_ro_en_lighteval = LightevalTaskConfig( + name="wmt16:ro-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_ro-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_ru_en_lighteval = LightevalTaskConfig( + name="wmt16:ru-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_ru-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_tr_en_lighteval = LightevalTaskConfig( + name="wmt16:tr-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_tr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, 
+) +wmt17_cs_en_lighteval = LightevalTaskConfig( + name="wmt17:cs-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt17_de_en_lighteval = LightevalTaskConfig( + name="wmt17:de-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt17_en_cs_lighteval = LightevalTaskConfig( + name="wmt17:en-cs", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt17_en_de_lighteval = LightevalTaskConfig( + name="wmt17:en-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt17_en_fi_lighteval = LightevalTaskConfig( + name="wmt17:en-fi", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_en-fi", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt17_en_lv_lighteval = LightevalTaskConfig( + name="wmt17:en-lv", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_en-lv", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt17_en_ru_lighteval = LightevalTaskConfig( + name="wmt17:en-ru", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_en-ru", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt17_en_tr_lighteval = LightevalTaskConfig( + name="wmt17:en-tr", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_en-tr", + 
hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt17_en_zh_lighteval = LightevalTaskConfig( + name="wmt17:en-zh", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_en-zh", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt17_fi_en_lighteval = LightevalTaskConfig( + name="wmt17:fi-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_fi-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt17_lv_en_lighteval = LightevalTaskConfig( + name="wmt17:lv-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_lv-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt17_ru_en_lighteval = LightevalTaskConfig( + name="wmt17:ru-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_ru-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt17_tr_en_lighteval = LightevalTaskConfig( + name="wmt17:tr-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_tr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt17_zh_en_lighteval = LightevalTaskConfig( + name="wmt17:zh-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_zh-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt18_cs_en_lighteval = LightevalTaskConfig( + name="wmt18:cs-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + 
output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt18_de_en_lighteval = LightevalTaskConfig( + name="wmt18:de-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt18_en_cs_lighteval = LightevalTaskConfig( + name="wmt18:en-cs", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt18_en_de_lighteval = LightevalTaskConfig( + name="wmt18:en-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt18_en_et_lighteval = LightevalTaskConfig( + name="wmt18:en-et", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_en-et", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt18_en_fi_lighteval = LightevalTaskConfig( + name="wmt18:en-fi", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_en-fi", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt18_en_ru_lighteval = LightevalTaskConfig( + name="wmt18:en-ru", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_en-ru", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt18_en_tr_lighteval = LightevalTaskConfig( + name="wmt18:en-tr", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_en-tr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt18_en_zh_lighteval = LightevalTaskConfig( + name="wmt18:en-zh", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + 
hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_en-zh", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt18_et_en_lighteval = LightevalTaskConfig( + name="wmt18:et-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_et-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt18_fi_en_lighteval = LightevalTaskConfig( + name="wmt18:fi-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_fi-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt18_ru_en_lighteval = LightevalTaskConfig( + name="wmt18:ru-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_ru-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt18_tr_en_lighteval = LightevalTaskConfig( + name="wmt18:tr-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_tr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt18_zh_en_lighteval = LightevalTaskConfig( + name="wmt18:zh-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_zh-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_cs_de_lighteval = LightevalTaskConfig( + name="wmt19:cs-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_cs-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_de_cs_lighteval = LightevalTaskConfig( + name="wmt19:de-cs", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_de-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, 
+ metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_de_en_lighteval = LightevalTaskConfig( + name="wmt19:de-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_de_fr_lighteval = LightevalTaskConfig( + name="wmt19:de-fr", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_de-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_en_cs_lighteval = LightevalTaskConfig( + name="wmt19:en-cs", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_en_de_lighteval = LightevalTaskConfig( + name="wmt19:en-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_en_fi_lighteval = LightevalTaskConfig( + name="wmt19:en-fi", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_en-fi", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_en_gu_lighteval = LightevalTaskConfig( + name="wmt19:en-gu", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_en-gu", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_en_kk_lighteval = LightevalTaskConfig( + name="wmt19:en-kk", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_en-kk", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_en_lt_lighteval = LightevalTaskConfig( + name="wmt19:en-lt", + suite=["lighteval", 
"sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_en-lt", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_en_ru_lighteval = LightevalTaskConfig( + name="wmt19:en-ru", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_en-ru", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_en_zh_lighteval = LightevalTaskConfig( + name="wmt19:en-zh", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_en-zh", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_fi_en_lighteval = LightevalTaskConfig( + name="wmt19:fi-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_fi-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_fr_de_lighteval = LightevalTaskConfig( + name="wmt19:fr-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_fr-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_gu_en_lighteval = LightevalTaskConfig( + name="wmt19:gu-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_gu-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_kk_en_lighteval = LightevalTaskConfig( + name="wmt19:kk-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_kk-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_lt_en_lighteval = LightevalTaskConfig( + name="wmt19:lt-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_lt-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + 
few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_ru_en_lighteval = LightevalTaskConfig( + name="wmt19:ru-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_ru-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_zh_en_lighteval = LightevalTaskConfig( + name="wmt19:zh-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_zh-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_cs_en_lighteval = LightevalTaskConfig( + name="wmt20:cs-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_de_en_lighteval = LightevalTaskConfig( + name="wmt20:de-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_de_fr_lighteval = LightevalTaskConfig( + name="wmt20:de-fr", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_de-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_en_cs_lighteval = LightevalTaskConfig( + name="wmt20:en-cs", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_en_de_lighteval = LightevalTaskConfig( + name="wmt20:en-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_en_iu_lighteval = 
LightevalTaskConfig( + name="wmt20:en-iu", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_en-iu", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_en_ja_lighteval = LightevalTaskConfig( + name="wmt20:en-ja", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_en-ja", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_en_km_lighteval = LightevalTaskConfig( + name="wmt20:en-km", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_en-km", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_en_pl_lighteval = LightevalTaskConfig( + name="wmt20:en-pl", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_en-pl", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_en_ps_lighteval = LightevalTaskConfig( + name="wmt20:en-ps", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_en-ps", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_en_ru_lighteval = LightevalTaskConfig( + name="wmt20:en-ru", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_en-ru", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_en_ta_lighteval = LightevalTaskConfig( + name="wmt20:en-ta", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_en-ta", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_en_zh_lighteval = LightevalTaskConfig( + name="wmt20:en-zh", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_en-zh", + hf_avail_splits=["test"], + evaluation_splits=["test"], + 
few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_fr_de_lighteval = LightevalTaskConfig( + name="wmt20:fr-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_fr-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_iu_en_lighteval = LightevalTaskConfig( + name="wmt20:iu-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_iu-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_ja_en_lighteval = LightevalTaskConfig( + name="wmt20:ja-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_ja-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_km_en_lighteval = LightevalTaskConfig( + name="wmt20:km-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_km-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_pl_en_lighteval = LightevalTaskConfig( + name="wmt20:pl-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_pl-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_ps_en_lighteval = LightevalTaskConfig( + name="wmt20:ps-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_ps-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_ru_en_lighteval = LightevalTaskConfig( + name="wmt20:ru-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_ru-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, 
+ version=0, +) +wmt20_ta_en_lighteval = LightevalTaskConfig( + name="wmt20:ta-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_ta-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_zh_en_lighteval = LightevalTaskConfig( + name="wmt20:zh-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_zh-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +word_sorting_bigbench = LightevalTaskConfig( + name="word_sorting", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="word_sorting", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +word_unscrambling_bigbench = LightevalTaskConfig( + name="word_unscrambling", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="word_unscrambling", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wsc273_lighteval = LightevalTaskConfig( + name="wsc273", + suite=["lighteval"], + prompt_function="wsc273", + hf_repo="winograd_wsc", + hf_subset="wsc273", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xcopa_en_lighteval = LightevalTaskConfig( + name="xcopa:en", + suite=["lighteval"], + prompt_function="xcopa_en", + hf_repo="xcopa", + hf_subset="default", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xcopa_et_lighteval = LightevalTaskConfig( + name="xcopa:et", + suite=["lighteval"], + prompt_function="xcopa_et", + hf_repo="xcopa", + hf_subset="et", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xcopa_ht_lighteval = LightevalTaskConfig( + name="xcopa:ht", + suite=["lighteval"], + prompt_function="xcopa_ht", + hf_repo="xcopa", + hf_subset="ht", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + 
metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xcopa_it_lighteval = LightevalTaskConfig( + name="xcopa:it", + suite=["lighteval"], + prompt_function="xcopa_it", + hf_repo="xcopa", + hf_subset="it", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xcopa_id_lighteval = LightevalTaskConfig( + name="xcopa:id", + suite=["lighteval"], + prompt_function="xcopa_id", + hf_repo="xcopa", + hf_subset="id", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xcopa_qu_lighteval = LightevalTaskConfig( + name="xcopa:qu", + suite=["lighteval"], + prompt_function="xcopa_qu", + hf_repo="xcopa", + hf_subset="qu", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xcopa_sw_lighteval = LightevalTaskConfig( + name="xcopa:sw", + suite=["lighteval"], + prompt_function="xcopa_sw", + hf_repo="xcopa", + hf_subset="sw", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xcopa_zh_lighteval = LightevalTaskConfig( + name="xcopa:zh", + suite=["lighteval"], + prompt_function="xcopa_zh", + hf_repo="xcopa", + hf_subset="zh", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xcopa_ta_lighteval = LightevalTaskConfig( + name="xcopa:ta", + suite=["lighteval"], + prompt_function="xcopa_ta", + hf_repo="xcopa", + hf_subset="ta", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xcopa_th_lighteval = LightevalTaskConfig( + name="xcopa:th", + suite=["lighteval"], + prompt_function="xcopa_th", + hf_repo="xcopa", + hf_subset="th", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xcopa_tr_lighteval = LightevalTaskConfig( + name="xcopa:tr", + suite=["lighteval"], + prompt_function="xcopa_tr", + hf_repo="xcopa", + hf_subset="tr", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, 
+ frozen=False, + trust_dataset=True, + version=0, +) +xcopa_vi_lighteval = LightevalTaskConfig( + name="xcopa:vi", + suite=["lighteval"], + prompt_function="xcopa_vi", + hf_repo="xcopa", + hf_subset="vi", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xstory_cloze_en_lighteval = LightevalTaskConfig( + name="xstory_cloze:en", + suite=["lighteval"], + prompt_function="storycloze", + hf_repo="juletxara/xstory_cloze", + hf_subset="en", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xstory_cloze_ru_lighteval = LightevalTaskConfig( + name="xstory_cloze:ru", + suite=["lighteval"], + prompt_function="storycloze", + hf_repo="juletxara/xstory_cloze", + hf_subset="ru", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xstory_cloze_zh_lighteval = LightevalTaskConfig( + name="xstory_cloze:zh", + suite=["lighteval"], + prompt_function="storycloze", + hf_repo="juletxara/xstory_cloze", + hf_subset="zh", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xstory_cloze_es_lighteval = LightevalTaskConfig( + name="xstory_cloze:es", + suite=["lighteval"], + prompt_function="storycloze", + hf_repo="juletxara/xstory_cloze", + hf_subset="es", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xstory_cloze_ar_lighteval = LightevalTaskConfig( + name="xstory_cloze:ar", + suite=["lighteval"], + prompt_function="storycloze", + hf_repo="juletxara/xstory_cloze", + hf_subset="ar", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xstory_cloze_hi_lighteval = LightevalTaskConfig( + name="xstory_cloze:hi", + suite=["lighteval"], + prompt_function="storycloze", + hf_repo="juletxara/xstory_cloze", + hf_subset="hi", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xstory_cloze_id_lighteval = LightevalTaskConfig( + name="xstory_cloze:id", + suite=["lighteval"], + prompt_function="storycloze", + hf_repo="juletxara/xstory_cloze", + hf_subset="id", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + 
generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xstory_cloze_te_lighteval = LightevalTaskConfig( + name="xstory_cloze:te", + suite=["lighteval"], + prompt_function="storycloze", + hf_repo="juletxara/xstory_cloze", + hf_subset="te", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xstory_cloze_sw_lighteval = LightevalTaskConfig( + name="xstory_cloze:sw", + suite=["lighteval"], + prompt_function="storycloze", + hf_repo="juletxara/xstory_cloze", + hf_subset="sw", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xstory_cloze_eu_lighteval = LightevalTaskConfig( + name="xstory_cloze:eu", + suite=["lighteval"], + prompt_function="storycloze", + hf_repo="juletxara/xstory_cloze", + hf_subset="eu", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xstory_cloze_my_lighteval = LightevalTaskConfig( + name="xstory_cloze:my", + suite=["lighteval"], + prompt_function="storycloze", + hf_repo="juletxara/xstory_cloze", + hf_subset="my", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xwinograd_en_lighteval = LightevalTaskConfig( + name="xwinograd:en", + suite=["lighteval"], + prompt_function="winogrande", + hf_repo="Muennighoff/xwinograd", + hf_subset="en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xwinograd_fr_lighteval = LightevalTaskConfig( + name="xwinograd:fr", + suite=["lighteval"], + prompt_function="winogrande", + hf_repo="Muennighoff/xwinograd", + hf_subset="fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xwinograd_jp_lighteval = LightevalTaskConfig( + name="xwinograd:jp", + suite=["lighteval"], + prompt_function="winogrande", + hf_repo="Muennighoff/xwinograd", + hf_subset="jp", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xwinograd_pt_lighteval = LightevalTaskConfig( + name="xwinograd:pt", + suite=["lighteval"], + prompt_function="winogrande", + hf_repo="Muennighoff/xwinograd", + hf_subset="pt", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + 
few_shots_select=None,
+ generation_size=-1,
+ metric=["loglikelihood_acc"],
+ stop_sequence=["\n"],
+ output_regex=None,
+ frozen=False,
+ trust_dataset=True,
+ version=0,
+)
+xwinograd_ru_lighteval = LightevalTaskConfig(
+ name="xwinograd:ru",
+ suite=["lighteval"],
+ prompt_function="winogrande",
+ hf_repo="Muennighoff/xwinograd",
+ hf_subset="ru",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metric=["loglikelihood_acc"],
+ stop_sequence=["\n"],
+ output_regex=None,
+ frozen=False,
+ trust_dataset=True,
+ version=0,
+)
+xwinograd_zh_lighteval = LightevalTaskConfig(
+ name="xwinograd:zh",
+ suite=["lighteval"],
+ prompt_function="winogrande",
+ hf_repo="Muennighoff/xwinograd",
+ hf_subset="zh",
+ hf_avail_splits=["test"],
+ evaluation_splits=["test"],
+ few_shots_split=None,
+ few_shots_select=None,
+ generation_size=-1,
+ metric=["loglikelihood_acc"],
+ stop_sequence=["\n"],
+ output_regex=None,
+ frozen=False,
+ trust_dataset=True,
+ version=0,
+)
diff --git a/src/lighteval/tasks/extended/ifeval/main.py b/src/lighteval/tasks/extended/ifeval/main.py
index 1faf55ee..c7290c3f 100644
--- a/src/lighteval/tasks/extended/ifeval/main.py
+++ b/src/lighteval/tasks/extended/ifeval/main.py
@@ -157,10 +157,8 @@ def agg_inst_level_acc(items):
 )
-_TASKS = [ifeval]
+TASKS_TABLE = [ifeval]
-# Convert to dict for lighteval
-TASKS_TABLE = [task.as_dict() for task in _TASKS]
 extend_enum(Metrics, "ifeval_metric", ifeval_metrics)
 if __name__ == "__main__":
diff --git a/src/lighteval/tasks/extended/mt_bench/main.py b/src/lighteval/tasks/extended/mt_bench/main.py
index a0ce741c..4dfdeb41 100644
--- a/src/lighteval/tasks/extended/mt_bench/main.py
+++ b/src/lighteval/tasks/extended/mt_bench/main.py
@@ -71,9 +71,7 @@ def mt_bench_prompt(line, task_name: str = None):
 )
-_TASKS = [task]
-
-TASKS_TABLE = [task.as_dict() for task in _TASKS]
+TASKS_TABLE = [task]
 if __name__ == "__main__":
 print(t["name"] for t in TASKS_TABLE)
diff --git a/src/lighteval/tasks/extended/tiny_benchmarks/main.py b/src/lighteval/tasks/extended/tiny_benchmarks/main.py
index a637f1a6..a8ce41a3 100644
--- a/src/lighteval/tasks/extended/tiny_benchmarks/main.py
+++ b/src/lighteval/tasks/extended/tiny_benchmarks/main.py
@@ -237,7 +237,7 @@ def aggregate(self, y_input):
 # },
 ]
-_TASKS = []
+TASKS_TABLE = []
 for task in task_params:
 name = task["name"]
 generation_size = None
@@ -259,7 +259,7 @@ def aggregate(self, y_input):
 generation_size=generation_size,
 stop_sequence=stop_sequence,
 )
- _TASKS.append(task)
+ TASKS_TABLE.append(task)
 # CUSTOM METRIC
 for task_param in task_params:
@@ -288,8 +288,6 @@ def aggregate(self, y_input):
 # MODULE LOGIC
 # You should not need to touch this
 # Convert to dict for lighteval
-TASKS_TABLE = [task.as_dict() for task in _TASKS]
-
 if __name__ == "__main__":
 print(t["name"] for t in TASKS_TABLE)
 print(len(TASKS_TABLE))
diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py
index fa70b61d..fa1b1d5a 100644
--- a/src/lighteval/tasks/lighteval_task.py
+++ b/src/lighteval/tasks/lighteval_task.py
@@ -30,6 +30,7 @@
 from datasets import load_dataset
+import lighteval.tasks.tasks_prompt_formatting as tasks_prompt_formatting
 from lighteval.few_shot_manager import FewShotSampler
 from lighteval.logging.hierarchical_logger import hlog, hlog_warn
 from lighteval.metrics import (
@@ -56,8 +57,6 @@
 )
 from lighteval.utils import NO_OPENAI_ERROR_MSG, as_list, is_openai_available
-from . import tasks_prompt_formatting
-
 if TYPE_CHECKING:
 from lighteval.logging.evaluation_tracker import EvaluationTracker
@@ -115,25 +114,6 @@ class LightevalTaskConfig:
 version: int = 0
- def as_dict(self):
- return {
- "name": self.name,
- "prompt_function": self.prompt_function,
- "hf_repo": self.hf_repo,
- "hf_subset": self.hf_subset,
- "metric": tuple(str(m) for m in self.metric),
- "hf_avail_splits": self.hf_avail_splits,
- "evaluation_splits": self.evaluation_splits,
- "few_shots_split": self.few_shots_split,
- "few_shots_select": self.few_shots_select,
- "generation_size": self.generation_size,
- "stop_sequence": self.stop_sequence,
- "output_regex": self.output_regex,
- "frozen": self.frozen,
- "suite": self.suite,
- "version": self.version,
- }
-
 def __post_init__(self):
 if self.suite is None:
 self.suite = ["custom"]
diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py
index df5e4da6..ef575b7e 100644
--- a/src/lighteval/tasks/registry.py
+++ b/src/lighteval/tasks/registry.py
@@ -28,9 +28,9 @@
 from types import ModuleType
 from typing import Dict, List, Optional, Tuple, Union
-from datasets import Dataset
 from datasets.load import dataset_module_factory
+import lighteval.tasks.default_tasks as default_tasks
 from lighteval.logging.hierarchical_logger import hlog, hlog_warn
 from lighteval.tasks.extended import AVAILABLE_EXTENDED_TASKS_MODULES
 from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig
@@ -57,8 +57,6 @@
 TRUNCATE_FEW_SHOTS_DEFAULTS = True
-TABLE_PATH = os.path.join(os.path.dirname(__file__), "tasks_table.jsonl")
-
 class Registry:
 """
@@ -110,7 +108,7 @@ def get_task_class(
 )
 def get_task_dict(
- self, task_name_list: List[str], custom_tasks: Optional[Union[str, ModuleType]] = None
+ self, task_name_list: List[str], custom_tasks: Optional[Union[str, Path, ModuleType]] = None
 ) -> Dict[str, LightevalTask]:
 """
 Get a dictionary of tasks based on the task name list.
@@ -155,7 +153,7 @@ def get_task_dict(
 return tasks_dict
-def create_custom_tasks_module(custom_tasks: Union[str, ModuleType]) -> ModuleType:
+def create_custom_tasks_module(custom_tasks: Union[str, Path, ModuleType]) -> ModuleType:
 """Creates a custom task module to load tasks defined by the user in their own file.
 Args:
@@ -234,15 +232,15 @@ def taskinfo_selector(
 def create_config_tasks(
- meta_table: Optional[Dataset] = None, cache_dir: Optional[str] = None
+ meta_table: Optional[List[LightevalTaskConfig]] = None, cache_dir: Optional[str] = None
 ) -> Dict[str, LightevalTask]:
 """
 Create configuration tasks based on the provided meta_table.
 Args:
- meta_table (Optional[Dataset]): meta_table containing task
+ meta_table: meta_table containing tasks
 configurations. If not provided, it will be loaded from TABLE_PATH.
- cache_dir (Optional[str]): Directory to store cached data. If not
+ cache_dir: Directory to store cached data. If not
 provided, the default cache directory will be used.
 Returns:
@@ -257,18 +255,18 @@ def __init__(self, custom_tasks_module=None):
 return LightevalTaskFromConfig
 if meta_table is None:
- meta_table = Dataset.from_json(TABLE_PATH)
+ meta_table = [config for config in vars(default_tasks).values() if isinstance(config, LightevalTaskConfig)]
 tasks_with_config = {}
 # Every task is renamed suite|task, if the suite is in DEFAULT_SUITE
- for line in meta_table:
- if not any(suite in line["suite"] for suite in DEFAULT_SUITES):
+ for config in meta_table:
+ if not any(suite in config.suite for suite in DEFAULT_SUITES):
 hlog_warn(
- f"This evaluation is not in any known suite: {line['name']} is in {line['suite']}, not in {DEFAULT_SUITES}. Skipping."
+ f"This evaluation is not in any known suite: {config.name} is in {config.suite}, not in {DEFAULT_SUITES}. Skipping."
 )
 continue
- for suite in line["suite"]:
+ for suite in config.suite:
 if suite in DEFAULT_SUITES:
- tasks_with_config[f"{suite}|{line['name']}"] = LightevalTaskConfig(**line)
+ tasks_with_config[f"{suite}|{config.name}"] = config
 return {task: create_task(task, cfg, cache_dir=cache_dir) for task, cfg in tasks_with_config.items()}
diff --git a/src/lighteval/tasks/tasks_table.jsonl b/src/lighteval/tasks/tasks_table.jsonl
deleted file mode 100644
index 0047ad5d..00000000
--- a/src/lighteval/tasks/tasks_table.jsonl
+++ /dev/null
@@ -1,1235 +0,0 @@
-{"name":"abstract_narrative_understanding","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"abstract_narrative_understanding","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0}
-{"name":"agieval:aqua-rat","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-aqua-rat","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0}
-{"name":"agieval:gaokao-biology","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-biology","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0}
-{"name":"agieval:gaokao-chemistry","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-chemistry","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0}
-{"name":"agieval:gaokao-chinese","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-chinese","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false,
"trust_dataset": true,"version":0} -{"name":"agieval:gaokao-english","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-english","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"agieval:gaokao-geography","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-geography","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"agieval:gaokao-history","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-history","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"agieval:gaokao-mathqa","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-mathqa","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"agieval:gaokao-physics","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-physics","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"agieval:logiqa-en","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-logiqa-en","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"agieval:logiqa-zh","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-logiqa-zh","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"agieval:lsat-ar","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-lsat-ar","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"agieval:lsat-lr","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-lsat-lr","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"agieval:lsat-rc","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-lsat-rc","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"agieval:sat-en","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-sat-en","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"agieval:sat-en-without-passage","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-sat-en-without-passage","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"agieval:sat-math","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-sat-math","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"anachronisms","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"anachronisms","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"analogical_similarity","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"analogical_similarity","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"analytic_entailment","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"analytic_entailment","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"anli","suite":["lighteval","anli"],"prompt_function":"anli","hf_repo":"anli","hf_subset":"plain_text","hf_avail_splits":["train_r1","dev_r1","train_r2","dev_r2","train_r3","dev_r3","test_r1","test_r2","test_r3"],"evaluation_splits":["test_r1","test_r2","test_r3"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"anli:r1","suite":["lighteval","anli"],"prompt_function":"anli","hf_repo":"anli","hf_subset":"plain_text","hf_avail_splits":["train_r1","dev_r1","test_r1"],"evaluation_splits":["test_r1"],"few_shots_split":"train_r1","few_shots_select":"random_sampling_from_train","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"anli:r2","suite":["lighteval","anli"],"prompt_function":"anli","hf_repo":"anli","hf_subset":"plain_text","hf_avail_splits":["train_r2","dev_r2","test_r2"],"evaluation_splits":["test_r2"],"few_shots_split":"train_r2","few_shots_select":"random_sampling_from_train","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"anli:r3","suite":["lighteval","anli"],"prompt_function":"anli","hf_repo":"anli","hf_subset":"plain_text","hf_avail_splits":["train_r3","dev_r3","test_r3"],"evaluation_splits":["test_r3"],"few_shots_split":"train_r3","few_shots_select":"random_sampling_from_train","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"arc:c:letters","suite":["original","arc"],"prompt_function":"arc_with_options_letters_predict","hf_repo":"ai2_arc","hf_subset":"ARC-Challenge","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"arc:c:options","suite":["original","arc"],"prompt_function":"arc_with_options","hf_repo":"ai2_arc","hf_subset":"ARC-Challenge","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"arc:c:simple","suite":["original","arc"],"prompt_function":"arc","hf_repo":"ai2_arc","hf_subset":"ARC-Challenge","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"arc:challenge","suite":["leaderboard","arc"],"prompt_function":"arc","hf_repo":"ai2_arc","hf_subset":"ARC-Challenge","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":1,"metric":["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"arc:easy","suite":["lighteval","arc"],"prompt_function":"arc","hf_repo":"ai2_arc","hf_subset":"ARC-Easy","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"arithmetic:1dc","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_1dc","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"arithmetic:2da","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_2da","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"arithmetic:2dm","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_2dm","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"arithmetic:2ds","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_2ds","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"arithmetic:3da","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_3da","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"arithmetic:3ds","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_3ds","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"arithmetic:4da","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_4da","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"arithmetic:4ds","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_4ds","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"arithmetic:5da","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_5da","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"arithmetic:5ds","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_5ds","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"arithmetic_bb","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"arithmetic","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"ascii_word_recognition","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"ascii_word_recognition","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"asdiv","suite":["lighteval"],"prompt_function":"asdiv","hf_repo":"EleutherAI\/asdiv","hf_subset":"asdiv","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"authorship_verification","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"authorship_verification","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"auto_categorization","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"auto_categorization","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"auto_debugging","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_and_after_query","hf_repo":"bigbench","hf_subset":"auto_debugging","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["perfect_exact_match"],"stop_sequence":null,"output_regex":"[^\\.\\?\\!\\;\\n]+", "trust_dataset": true,"version":0} -{"name":"babi_qa","suite":["helm"],"prompt_function":"babi_qa","hf_repo":"facebook\/babi_qa","hf_subset":"en-valid-qa1","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:causal_judgment","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"causal_judgement","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bigbench:date_understanding","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"date_understanding","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bigbench:disambiguation_qa","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"disambiguation_qa","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bigbench:geometric_shapes","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"geometric_shapes","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bigbench:logical_deduction_five_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_five_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bigbench:logical_deduction_seven_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_seven_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} 
-{"name":"bigbench:logical_deduction_three_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_three_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bigbench:movie_recommendation","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"movie_recommendation","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bigbench:navigate","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"navigate","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bigbench:reasoning_about_colored_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"reasoning_about_colored_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bigbench:ruin_names","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"ruin_names","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bigbench:salient_translation_error_detection","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"salient_translation_error_detection","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bigbench:snarks","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"snarks","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bigbench:sports_understanding","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"sports_understanding","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} 
-{"name":"bigbench:temporal_sequences","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"temporal_sequences","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bigbench:tracking_shuffled_objects_five_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_five_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bigbench:tracking_shuffled_objects_seven_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_seven_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bigbench:tracking_shuffled_objects_three_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_three_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bigbench:causal_judgment","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"causal_judgement","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} -{"name":"bigbench:date_understanding","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"date_understanding","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} -{"name":"bigbench:disambiguation_qa","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"disambiguation_qa","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} 
-{"name":"bigbench:geometric_shapes","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"geometric_shapes","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} -{"name":"bigbench:logical_deduction_five_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_five_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} -{"name":"bigbench:logical_deduction_seven_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_seven_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} -{"name":"bigbench:logical_deduction_three_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_three_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} -{"name":"bigbench:movie_recommendation","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"movie_recommendation","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} -{"name":"bigbench:navigate","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"navigate","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} -{"name":"bigbench:reasoning_about_colored_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"reasoning_about_colored_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} 
-{"name":"bigbench:ruin_names","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"ruin_names","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} -{"name":"bigbench:salient_translation_error_detection","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"salient_translation_error_detection","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} -{"name":"bigbench:snarks","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"snarks","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} -{"name":"bigbench:sports_understanding","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"sports_understanding","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} -{"name":"bigbench:temporal_sequences","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"temporal_sequences","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} -{"name":"bigbench:tracking_shuffled_objects_five_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_five_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} -{"name":"bigbench:tracking_shuffled_objects_seven_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_seven_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} 
-{"name":"bigbench:tracking_shuffled_objects_three_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_three_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} -{"name":"bbh:boolean_expressions","suite":["harness"],"prompt_function":"bbh_boolean_expressions","hf_repo":"lukaemon/bbh","hf_subset":"boolean_expressions","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:causal_judgment","suite":["harness"],"prompt_function":"bbh_causal_judgment","hf_repo":"lukaemon/bbh","hf_subset":"causal_judgement","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:date_understanding","suite":["harness"],"prompt_function":"bbh_date_understanding","hf_repo":"lukaemon/bbh","hf_subset":"date_understanding","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:disambiguation_qa","suite":["harness"],"prompt_function":"bbh_disambiguation_qa","hf_repo":"lukaemon/bbh","hf_subset":"disambiguation_qa","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:dyck_languages","suite":["harness"],"prompt_function":"bbh_dyck_languages","hf_repo":"lukaemon/bbh","hf_subset":"dyck_languages","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:formal_fallacies","suite":["harness"],"prompt_function":"bbh_formal_fallacies","hf_repo":"lukaemon/bbh","hf_subset":"formal_fallacies","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} 
-{"name":"bbh:geometric_shapes","suite":["harness"],"prompt_function":"bbh_geometric_shapes","hf_repo":"lukaemon/bbh","hf_subset":"geometric_shapes","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:hyperbaton","suite":["harness"],"prompt_function":"bbh_hyperbaton","hf_repo":"lukaemon/bbh","hf_subset":"hyperbaton","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:logical_deduction_five_objects","suite":["harness"],"prompt_function":"bbh_logical_deduction_five_objects","hf_repo":"lukaemon/bbh","hf_subset":"logical_deduction_five_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:logical_deduction_seven_objects","suite":["harness"],"prompt_function":"bbh_logical_deduction_seven_objects","hf_repo":"lukaemon/bbh","hf_subset":"logical_deduction_seven_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:logical_deduction_three_objects","suite":["harness"],"prompt_function":"bbh_logical_deduction_three_objects","hf_repo":"lukaemon/bbh","hf_subset":"logical_deduction_three_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:movie_recommendation","suite":["harness"],"prompt_function":"bbh_movie_recommendation","hf_repo":"lukaemon/bbh","hf_subset":"movie_recommendation","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:multistep_arithmetic_two","suite":["harness"],"prompt_function":"bbh_multistep_arithmetic_two","hf_repo":"lukaemon/bbh","hf_subset":"multistep_arithmetic_two","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", 
"\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:navigate","suite":["harness"],"prompt_function":"bbh_navigate","hf_repo":"lukaemon/bbh","hf_subset":"navigate","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:object_counting","suite":["harness"],"prompt_function":"bbh_object_counting","hf_repo":"lukaemon/bbh","hf_subset":"object_counting","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:penguins_in_a_table","suite":["harness"],"prompt_function":"bbh_penguins_in_a_table","hf_repo":"lukaemon/bbh","hf_subset":"penguins_in_a_table","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:reasoning_about_colored_objects","suite":["harness"],"prompt_function":"bbh_reasoning_about_colored_objects","hf_repo":"lukaemon/bbh","hf_subset":"reasoning_about_colored_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:ruin_names","suite":["harness"],"prompt_function":"bbh_ruin_names","hf_repo":"lukaemon/bbh","hf_subset":"ruin_names","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:salient_translation_error_detection","suite":["harness"],"prompt_function":"bbh_salient_translation_error_detection","hf_repo":"lukaemon/bbh","hf_subset":"salient_translation_error_detection","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:snarks","suite":["harness"],"prompt_function":"bbh_snarks","hf_repo":"lukaemon/bbh","hf_subset":"snarks","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, 
"trust_dataset":true,"version":0} -{"name":"bbh:sports_understanding","suite":["harness"],"prompt_function":"bbh_sports_understanding","hf_repo":"lukaemon/bbh","hf_subset":"sports_understanding","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:temporal_sequences","suite":["harness"],"prompt_function":"bbh_temporal_sequences","hf_repo":"lukaemon/bbh","hf_subset":"temporal_sequences","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:tracking_shuffled_objects_five_objects","suite":["harness"],"prompt_function":"bbh_tracking_shuffled_objects_five_objects","hf_repo":"lukaemon/bbh","hf_subset":"tracking_shuffled_objects_five_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:tracking_shuffled_objects_seven_objects","suite":["harness"],"prompt_function":"bbh_tracking_shuffled_objects_seven_objects","hf_repo":"lukaemon/bbh","hf_subset":"tracking_shuffled_objects_seven_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:tracking_shuffled_objects_three_objects","suite":["harness"],"prompt_function":"bbh_tracking_shuffled_objects_three_objects","hf_repo":"lukaemon/bbh","hf_subset":"tracking_shuffled_objects_three_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:web_of_lies","suite":["harness"],"prompt_function":"bbh_web_of_lies","hf_repo":"lukaemon/bbh","hf_subset":"web_of_lies","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} 
-{"name":"bbh:word_sorting","suite":["harness"],"prompt_function":"bbh_word_sorting","hf_repo":"lukaemon/bbh","hf_subset":"word_sorting","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbq","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"all","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bbq:Age","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Age","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bbq:Disability_status","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Disability_status","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bbq:Gender_identity","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Gender_identity","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bbq:Nationality","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Nationality","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bbq:Physical_appearance","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Physical_appearance","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bbq:Race_ethnicity","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Race_ethnicity","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bbq:Race_x_SES","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Race_x_SES","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bbq:Race_x_gender","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Race_x_gender","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bbq:Religion","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Religion","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bbq:SES","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"SES","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bbq:Sexual_orientation","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Sexual_orientation","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bbq_lite_json","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"bbq_lite_json","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bigbench:auto_debugging","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"auto_debugging","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:bbq_lite_json:age_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-age_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:bbq_lite_json:age_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-age_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:bbq_lite_json:disability_status_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-disability_status_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:bbq_lite_json:disability_status_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-disability_status_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:bbq_lite_json:gender_identity_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-gender_identity_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bigbench:bbq_lite_json:gender_identity_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-gender_identity_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:bbq_lite_json:nationality_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-nationality_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:bbq_lite_json:nationality_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-nationality_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:bbq_lite_json:physical_appearance_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-physical_appearance_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:bbq_lite_json:physical_appearance_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-physical_appearance_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:bbq_lite_json:race_ethnicity_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-race_ethnicity_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bigbench:bbq_lite_json:race_ethnicity_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-race_ethnicity_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:bbq_lite_json:religion_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-religion_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:bbq_lite_json:religion_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-religion_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:bbq_lite_json:ses_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-ses_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:bbq_lite_json:ses_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-ses_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:bbq_lite_json:sexual_orientation_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-sexual_orientation_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bigbench:bbq_lite_json:sexual_orientation_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-sexual_orientation_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:code_line_description","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"code_line_description","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conceptual_combinations:contradictions","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conceptual_combinations-contradictions","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conceptual_combinations:emergent_properties","suite":["helm"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conceptual_combinations-emergent_properties","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conceptual_combinations:fanciful_fictional_combinations","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conceptual_combinations-fanciful_fictional_combinations","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conceptual_combinations:homonyms","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conceptual_combinations-homonyms","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bigbench:conceptual_combinations:invented_words","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conceptual_combinations-invented_words","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conlang_translation:adna_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-adna_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conlang_translation:adna_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-adna_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conlang_translation:atikampe_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-atikampe_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conlang_translation:atikampe_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-atikampe_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conlang_translation:gornam_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-gornam_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conlang_translation:gornam_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-gornam_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bigbench:conlang_translation:holuan_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-holuan_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conlang_translation:holuan_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-holuan_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conlang_translation:mkafala_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-mkafala_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conlang_translation:mkafala_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-mkafala_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conlang_translation:postpositive_english_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-postpositive_english_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conlang_translation:postpositive_english_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-postpositive_english_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conlang_translation:unapuri_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-unapuri_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bigbench:conlang_translation:unapuri_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-unapuri_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conlang_translation:vaomi_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-vaomi_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conlang_translation:vaomi_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-vaomi_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:emoji_movie","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"emoji_movie","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:formal_fallacies_syllogisms_negation","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"formal_fallacies_syllogisms_negation","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:hindu_knowledge","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"hindu_knowledge","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:known_unknowns","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"known_unknowns","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bigbench:language_identification","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"language_identification","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:linguistics_puzzles","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"linguistics_puzzles","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:logic_grid_puzzle","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"logic_grid_puzzle","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:logical_deduction-five_objects","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"logical_deduction-five_objects","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:logical_deduction-seven_objects","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"logical_deduction-seven_objects","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:logical_deduction-three_objects","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"logical_deduction-three_objects","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bigbench:misconceptions_russian","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"misconceptions_russian","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:novel_concepts","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"novel_concepts","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:operators","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"operators","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:parsinlu_reading_comprehension","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"parsinlu_reading_comprehension","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:play_dialog_same_or_different","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"play_dialog_same_or_different","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:repeat_copy_logic","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"repeat_copy_logic","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:strange_stories-boolean","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"strange_stories-boolean","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bigbench:strange_stories-multiple_choice","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"strange_stories-multiple_choice","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:strategyqa","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"strategyqa","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:symbol_interpretation-adversarial","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"symbol_interpretation-adversarial","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:symbol_interpretation-emoji_agnostic","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"symbol_interpretation-emoji_agnostic","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:symbol_interpretation-name_agnostic","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"symbol_interpretation-name_agnostic","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:symbol_interpretation-plain","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"symbol_interpretation-plain","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bigbench:symbol_interpretation-tricky","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"symbol_interpretation-tricky","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:vitaminc_fact_verification","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"vitaminc_fact_verification","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:winowhy","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"winowhy","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:adjunct_island","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"adjunct_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:adjunct_island","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"adjunct_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:anaphor_gender_agreement","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"anaphor_gender_agreement","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:anaphor_gender_agreement","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"anaphor_gender_agreement","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:anaphor_number_agreement","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"anaphor_number_agreement","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, 
"trust_dataset": true,"version":0} -{"name":"blimp:anaphor_number_agreement","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"anaphor_number_agreement","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:animate_subject_passive","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"animate_subject_passive","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:animate_subject_passive","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"animate_subject_passive","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:animate_subject_trans","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"animate_subject_trans","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:animate_subject_trans","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"animate_subject_trans","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:causative","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"causative","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:causative","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"causative","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:complex_NP_island","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"complex_NP_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:complex_NP_island","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"complex_NP_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:coordinate_structure_constraint_complex_left_branch","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"coordinate_structure_constraint_complex_left_branch","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:coordinate_structure_constraint_complex_left_branch","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"coordinate_structure_constraint_complex_left_branch","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:coordinate_structure_constraint_object_extraction","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"coordinate_structure_constraint_object_extraction","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:coordinate_structure_constraint_object_extraction","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"coordinate_structure_constraint_object_extraction","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:determiner_noun_agreement_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:determiner_noun_agreement_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:determiner_noun_agreement_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:determiner_noun_agreement_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:determiner_noun_agreement_irregular_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_irregular_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:determiner_noun_agreement_irregular_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_irregular_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:determiner_noun_agreement_irregular_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_irregular_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:determiner_noun_agreement_irregular_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_irregular_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:determiner_noun_agreement_with_adj_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adj_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:determiner_noun_agreement_with_adj_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adj_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:determiner_noun_agreement_with_adj_irregular_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adj_irregular_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:determiner_noun_agreement_with_adj_irregular_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adj_irregular_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:determiner_noun_agreement_with_adj_irregular_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adj_irregular_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:determiner_noun_agreement_with_adj_irregular_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adj_irregular_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:determiner_noun_agreement_with_adjective_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adjective_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:determiner_noun_agreement_with_adjective_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adjective_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:distractor_agreement_relational_noun","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"distractor_agreement_relational_noun","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:distractor_agreement_relational_noun","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"distractor_agreement_relational_noun","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:distractor_agreement_relative_clause","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"distractor_agreement_relative_clause","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:distractor_agreement_relative_clause","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"distractor_agreement_relative_clause","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:drop_argument","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"drop_argument","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:drop_argument","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"drop_argument","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:ellipsis_n_bar_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"ellipsis_n_bar_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:ellipsis_n_bar_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"ellipsis_n_bar_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:ellipsis_n_bar_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"ellipsis_n_bar_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:ellipsis_n_bar_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"ellipsis_n_bar_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:existential_there_object_raising","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"existential_there_object_raising","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:existential_there_object_raising","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"existential_there_object_raising","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:existential_there_quantifiers_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"existential_there_quantifiers_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:existential_there_quantifiers_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"existential_there_quantifiers_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:existential_there_quantifiers_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"existential_there_quantifiers_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:existential_there_quantifiers_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"existential_there_quantifiers_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:existential_there_subject_raising","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"existential_there_subject_raising","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:existential_there_subject_raising","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"existential_there_subject_raising","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:expletive_it_object_raising","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"expletive_it_object_raising","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:expletive_it_object_raising","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"expletive_it_object_raising","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:inchoative","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"inchoative","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:inchoative","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"inchoative","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:intransitive","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"intransitive","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:intransitive","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"intransitive","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:irregular_past_participle_adjectives","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"irregular_past_participle_adjectives","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:irregular_past_participle_adjectives","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"irregular_past_participle_adjectives","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:irregular_past_participle_verbs","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"irregular_past_participle_verbs","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:irregular_past_participle_verbs","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"irregular_past_participle_verbs","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:irregular_plural_subject_verb_agreement_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"irregular_plural_subject_verb_agreement_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:irregular_plural_subject_verb_agreement_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"irregular_plural_subject_verb_agreement_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:irregular_plural_subject_verb_agreement_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"irregular_plural_subject_verb_agreement_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:irregular_plural_subject_verb_agreement_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"irregular_plural_subject_verb_agreement_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:left_branch_island_echo_question","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"left_branch_island_echo_question","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:left_branch_island_echo_question","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"left_branch_island_echo_question","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:left_branch_island_simple_question","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"left_branch_island_simple_question","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:left_branch_island_simple_question","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"left_branch_island_simple_question","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:matrix_question_npi_licensor_present","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"matrix_question_npi_licensor_present","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:matrix_question_npi_licensor_present","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"matrix_question_npi_licensor_present","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:npi_present_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"npi_present_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:npi_present_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"npi_present_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:npi_present_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"npi_present_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:npi_present_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"npi_present_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:only_npi_licensor_present","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"only_npi_licensor_present","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:only_npi_licensor_present","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"only_npi_licensor_present","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:only_npi_scope","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"only_npi_scope","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:only_npi_scope","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"only_npi_scope","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:passive_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"passive_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:passive_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"passive_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:passive_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"passive_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:passive_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"passive_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:principle_A_c_command","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"principle_A_c_command","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, 
"trust_dataset": true,"version":0} -{"name":"blimp:principle_A_c_command","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"principle_A_c_command","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:principle_A_case_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"principle_A_case_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:principle_A_case_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"principle_A_case_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:principle_A_case_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"principle_A_case_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:principle_A_case_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"principle_A_case_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:principle_A_domain_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"principle_A_domain_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:principle_A_domain_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"principle_A_domain_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:principle_A_domain_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"principle_A_domain_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:principle_A_domain_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"principle_A_domain_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:principle_A_domain_3","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"principle_A_domain_3","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:principle_A_domain_3","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"principle_A_domain_3","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:principle_A_reconstruction","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"principle_A_reconstruction","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:principle_A_reconstruction","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"principle_A_reconstruction","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:regular_plural_subject_verb_agreement_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"regular_plural_subject_verb_agreement_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:regular_plural_subject_verb_agreement_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"regular_plural_subject_verb_agreement_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:regular_plural_subject_verb_agreement_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"regular_plural_subject_verb_agreement_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:regular_plural_subject_verb_agreement_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"regular_plural_subject_verb_agreement_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:sentential_negation_npi_licensor_present","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"sentential_negation_npi_licensor_present","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:sentential_negation_npi_licensor_present","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"sentential_negation_npi_licensor_present","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:sentential_negation_npi_scope","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"sentential_negation_npi_scope","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:sentential_negation_npi_scope","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"sentential_negation_npi_scope","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:sentential_subject_island","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"sentential_subject_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:sentential_subject_island","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"sentential_subject_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:superlative_quantifiers_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"superlative_quantifiers_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:superlative_quantifiers_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"superlative_quantifiers_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:superlative_quantifiers_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"superlative_quantifiers_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:superlative_quantifiers_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"superlative_quantifiers_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:tough_vs_raising_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"tough_vs_raising_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:tough_vs_raising_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"tough_vs_raising_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:tough_vs_raising_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"tough_vs_raising_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:tough_vs_raising_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"tough_vs_raising_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:transitive","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"transitive","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:transitive","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"transitive","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:wh_island","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:wh_island","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:wh_questions_object_gap","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_questions_object_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:wh_questions_object_gap","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_questions_object_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:wh_questions_subject_gap","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_questions_subject_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:wh_questions_subject_gap","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_questions_subject_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:wh_questions_subject_gap_long_distance","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_questions_subject_gap_long_distance","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:wh_questions_subject_gap_long_distance","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_questions_subject_gap_long_distance","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:wh_vs_that_no_gap","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_vs_that_no_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:wh_vs_that_no_gap","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_vs_that_no_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:wh_vs_that_no_gap_long_distance","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_vs_that_no_gap_long_distance","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:wh_vs_that_no_gap_long_distance","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_vs_that_no_gap_long_distance","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:wh_vs_that_with_gap","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_vs_that_with_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:wh_vs_that_with_gap","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_vs_that_with_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:wh_vs_that_with_gap_long_distance","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_vs_that_with_gap_long_distance","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:wh_vs_that_with_gap_long_distance","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_vs_that_with_gap_long_distance","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bold","suite":["helm"],"prompt_function":"bold","hf_repo":"lighteval\/bold_helm","hf_subset":"all","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["prediction_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bold:gender","suite":["helm"],"prompt_function":"bold","hf_repo":"lighteval\/bold_helm","hf_subset":"gender","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["prediction_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bold:political_ideology","suite":["helm"],"prompt_function":"bold","hf_repo":"lighteval\/bold_helm","hf_subset":"political_ideology","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["prediction_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bold:profession","suite":["helm"],"prompt_function":"bold","hf_repo":"lighteval\/bold_helm","hf_subset":"profession","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["prediction_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bold:race","suite":["helm"],"prompt_function":"bold","hf_repo":"lighteval\/bold_helm","hf_subset":"race","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["prediction_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bold:religious_ideology","suite":["helm"],"prompt_function":"bold","hf_repo":"lighteval\/bold_helm","hf_subset":"religious_ideology","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["prediction_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"boolq","suite":["helm","helm_general"],"prompt_function":"boolq_helm","hf_repo":"lighteval\/boolq_helm","hf_subset":"default","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"boolq:contrastset","suite":["helm"],"prompt_function":"boolq_helm_contrastset","hf_repo":"lighteval\/boolq_helm","hf_subset":"default","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bridging_anaphora_resolution_barqa","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"bridging_anaphora_resolution_barqa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"causal_judgment","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"causal_judgment","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"cause_and_effect","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"cause_and_effect","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"checkmate_in_one","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"checkmate_in_one","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"chess_state_tracking","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"chess_state_tracking","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"chinese_remainder_theorem","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"chinese_remainder_theorem","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"cifar10_classification","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"cifar10_classification","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"civil_comments","suite":["helm","helm_general"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"all","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"civil_comments:LGBTQ","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"LGBTQ","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"civil_comments:black","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"black","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"civil_comments:christian","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"christian","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"civil_comments:female","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"female","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"civil_comments:male","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"male","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"civil_comments:muslim","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"muslim","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"civil_comments:other_religions","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"other_religions","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"civil_comments:white","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"white","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"code_line_description","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_and_after_query","hf_repo":"bigbench","hf_subset":"code_line_description","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"codenames","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"codenames","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["rouge_t5","bleu","bleu"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"color","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"color","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["rouge_t5","bleu","loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"common_morpheme","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"common_morpheme","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"commonsenseqa","suite":["helm","commonsense_scenario"],"prompt_function":"commonsense_qa","hf_repo":"commonsense_qa","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"conceptual_combinations","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"conceptual_combinations","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"conlang_translation","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"conlang_translation","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge_t5","bleu","perfect_exact_match"],"stop_sequence":[".",";","!","?"],"output_regex":"[^\\.\\?\\!\\;\\n]+", "trust_dataset": true,"version":0} -{"name":"contextual_parametric_knowledge_conflicts","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"contextual_parametric_knowledge_conflicts","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["rouge_t5","loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"copyright:n_books_1000-extractions_per_book_1-prefix_length_125","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"n_books_1000-extractions_per_book_1-prefix_length_125","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"copyright:n_books_1000-extractions_per_book_1-prefix_length_25","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"n_books_1000-extractions_per_book_1-prefix_length_25","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"copyright:n_books_1000-extractions_per_book_1-prefix_length_5","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"n_books_1000-extractions_per_book_1-prefix_length_5","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"copyright:n_books_1000-extractions_per_book_3-prefix_length_125","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"n_books_1000-extractions_per_book_3-prefix_length_125","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"copyright:n_books_1000-extractions_per_book_3-prefix_length_25","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"n_books_1000-extractions_per_book_3-prefix_length_25","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"copyright:n_books_1000-extractions_per_book_3-prefix_length_5","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"n_books_1000-extractions_per_book_3-prefix_length_5","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"copyright:oh_the_places","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"oh_the_places","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"copyright:pilot","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"pilot","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"copyright:popular_books-prefix_length_10","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"popular_books-prefix_length_10","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"copyright:popular_books-prefix_length_125","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"popular_books-prefix_length_125","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"copyright:popular_books-prefix_length_25","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"popular_books-prefix_length_25","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"copyright:popular_books-prefix_length_250","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"popular_books-prefix_length_250","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, 
"trust_dataset": true,"version":0} -{"name":"copyright:popular_books-prefix_length_5","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"popular_books-prefix_length_5","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"copyright:popular_books-prefix_length_50","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"popular_books-prefix_length_50","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"copyright:prompt_num_line_1-min_lines_20","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"prompt_num_line_1-min_lines_20","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"copyright:prompt_num_line_10-min_lines_20","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"prompt_num_line_10-min_lines_20","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"copyright:prompt_num_line_5-min_lines_20","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"prompt_num_line_5-min_lines_20","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"coqa","suite":["lighteval"],"prompt_function":"coqa","hf_repo":"coqa","hf_subset":"default","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["perfect_exact_match","f1_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"coqa_bb","suite":["lighteval","bigbench_programmatic","bigbench"],"prompt_function":"coqa","hf_repo":"coqa","hf_subset":"default","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["perfect_exact_match","f1_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"covid_dialogue","suite":["helm"],"prompt_function":"covid_dialogue","hf_repo":"lighteval\/covid_dialogue","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":128,"metric":["exact_match","quasi_exact_match","f1_score","rougeL","bleu_1","bleu_4"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"crash_blossom","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"crash_blossom","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"crass_ai","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"crass_ai","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"cryobiology_spanish","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"cryobiology_spanish","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"cryptonite","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"cryptonite","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"cs_algorithms","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"cs_algorithms","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"dark_humor_detection","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"dark_humor_detection","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"date_understanding","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"date_understanding","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"disambiguation_qa","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"disambiguation_qa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"discourse_marker_prediction","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"discourse_marker_prediction","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"disfl_qa","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"disfl_qa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"drop","suite":["lighteval"],"prompt_function":"drop","hf_repo":"lighteval/drop_harness","hf_subset":"default","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":"train","few_shots_select":"random_sampling_from_train","generation_size":null,"metric":["drop"],"stop_sequence":["."],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"dyck_language:2","suite":["helm"],"prompt_function":"dyck_language","hf_repo":"lighteval\/DyckLanguage","hf_subset":"2","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"dyck_language:3","suite":["helm"],"prompt_function":"dyck_language","hf_repo":"lighteval\/DyckLanguage","hf_subset":"3","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"dyck_language:4","suite":["helm"],"prompt_function":"dyck_language","hf_repo":"lighteval\/DyckLanguage","hf_subset":"4","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"dyck_languages","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"dyck_languages","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"elementary_math_qa","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"elementary_math_qa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"emoji_movie","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"emoji_movie","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["rouge_t5","bleu","loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"emojis_emotion_prediction","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"emojis_emotion_prediction","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"empirical_judgments","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"empirical_judgments","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"english_proverbs","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"english_proverbs","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"english_russian_proverbs","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"english_russian_proverbs","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"entailed_polarity","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"entailed_polarity","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"entailed_polarity_hindi","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"entailed_polarity_hindi","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"entity_data_imputation:Buy","suite":["helm"],"prompt_function":"entity_data_imputation","hf_repo":"lighteval\/Buy","hf_subset":"default","hf_avail_splits":["train","test","valid"],"evaluation_splits":["valid","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": 
true,"version":0} -{"name":"entity_data_imputation:Restaurant","suite":["helm"],"prompt_function":"entity_data_imputation","hf_repo":"lighteval\/Restaurant","hf_subset":"default","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"entity_matching:Abt_Buy","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Abt_Buy","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"entity_matching:Amazon_Google","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Amazon_Google","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"entity_matching:Beer","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Beer","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"entity_matching:Company","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Company","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"entity_matching:DBLP_ACM","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"DBLP_ACM","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"entity_matching:DBLP_GoogleScholar","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"DBLP_GoogleScholar","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"entity_matching:Dirty_DBLP_ACM","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Dirty_DBLP_ACM","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"entity_matching:Dirty_DBLP_GoogleScholar","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Dirty_DBLP_GoogleScholar","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"entity_matching:Dirty_Walmart_Amazon","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Dirty_Walmart_Amazon","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"entity_matching:Dirty_iTunes_Amazon","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Dirty_iTunes_Amazon","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"entity_matching:Fodors_Zagats","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Fodors_Zagats","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"entity_matching:Walmart_Amazon","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Walmart_Amazon","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"entity_matching:iTunes_Amazon","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"iTunes_Amazon","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, 
"trust_dataset": true,"version":0} -{"name":"epistemic_reasoning","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"epistemic_reasoning","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"ethics:commonsense","suite":["lighteval","ethics"],"prompt_function":"ethics_commonsense","hf_repo":"lighteval\/hendrycks_ethics","hf_subset":"commonsense","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"ethics:deontology","suite":["lighteval","ethics"],"prompt_function":"ethics_deontology","hf_repo":"lighteval\/hendrycks_ethics","hf_subset":"deontology","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"ethics:justice","suite":["lighteval","ethics"],"prompt_function":"ethics_justice","hf_repo":"lighteval\/hendrycks_ethics","hf_subset":"justice","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"ethics:utilitarianism","suite":["lighteval","ethics"],"prompt_function":"ethics_utilitarianism","hf_repo":"lighteval\/hendrycks_ethics","hf_subset":"utilitarianism","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"ethics:virtue","suite":["lighteval","ethics"],"prompt_function":"ethics_virtue","hf_repo":"lighteval\/hendrycks_ethics","hf_subset":"virtue","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"evaluating_information_essentiality","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"evaluating_information_essentiality","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"fact_checker","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"fact_checker","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"fantasy_reasoning","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"fantasy_reasoning","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"few_shot_nlg","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"few_shot_nlg","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","bleurt"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"figure_of_speech_detection","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"figure_of_speech_detection","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"formal_fallacies_syllogisms_negation","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"formal_fallacies_syllogisms_negation","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"gem","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"gem","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"gender_inclusive_sentences_german","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"gender_inclusive_sentences_german","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"general_knowledge","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"general_knowledge","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"geometric_shapes","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"geometric_shapes","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["rouge_t5","bleu","loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"glue:cola","suite":["lighteval","glue"],"prompt_function":"cola","hf_repo":"glue","hf_subset":"cola","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token", "mcc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"glue:mnli","suite":["lighteval","glue"],"prompt_function":"mnli","hf_repo":"glue","hf_subset":"mnli_matched","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"glue:mnli_mismatched","suite":["lighteval","glue"],"prompt_function":"mnli","hf_repo":"glue","hf_subset":"mnli_mismatched","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"glue:mrpc","suite":["lighteval","glue"],"prompt_function":"mrpc","hf_repo":"glue","hf_subset":"mrpc","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc", "loglikelihood_f1"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"glue:qnli","suite":["lighteval","glue"],"prompt_function":"qnli","hf_repo":"glue","hf_subset":"qnli","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"glue:qqp","suite":["lighteval","glue"],"prompt_function":"qqp","hf_repo":"glue","hf_subset":"qqp","hf_avail_splits":["train","validation","test"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc", "loglikelihood_f1"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"glue:rte","suite":["lighteval","glue"],"prompt_function":"rte","hf_repo":"glue","hf_subset":"rte","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"glue:sst2","suite":["lighteval","glue"],"prompt_function":"sst","hf_repo":"glue","hf_subset":"sst2","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"glue:stsb","suite":["lighteval","glue"],"prompt_function":"stsb","hf_repo":"glue","hf_subset":"stsb","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"glue:wnli","suite":["lighteval","glue"],"prompt_function":"wnli","hf_repo":"glue","hf_subset":"wnli","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"goal_step_wikihow","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"goal_step_wikihow","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"gpqa","suite":["lighteval"],"prompt_function":"gpqa","hf_repo":"Idavidrein/gpqa","hf_subset":"gpqa_main","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"gre_reading_comprehension","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"gre_reading_comprehension","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"gsm8k","suite":["leaderboard"],"prompt_function":"gsm8k","hf_repo":"gsm8k","hf_subset":"main","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":256,"metric":["quasi_exact_match_gsm8k"],"stop_sequence":["Question:","Question",":"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"gsm8k","suite":["lighteval"],"prompt_function":"gsm8k","hf_repo":"gsm8k","hf_subset":"main","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":256,"metric":["quasi_exact_match_gsm8k","maj_at_8_gsm8k"],"stop_sequence":["Question:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"headqa:en","suite":["lighteval","headqa"],"prompt_function":"headqa","hf_repo":"lighteval/headqa_harness","hf_subset":"en","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"headqa:es","suite":["lighteval","headqa"],"prompt_function":"headqa","hf_repo":"lighteval/headqa_harness","hf_subset":"es","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"hellaswag","suite":["leaderboard"],"prompt_function":"hellaswag_harness","hf_repo":"hellaswag","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"hellaswag","suite":["helm","helm_general"],"prompt_function":"hellaswag_helm","hf_repo":"hellaswag","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"hhh_alignment","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"hhh_alignment","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"hindi_question_answering","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"hindi_question_answering","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"hindu_knowledge","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"hindu_knowledge","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"hinglish_toxicity","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"hinglish_toxicity","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"human_organs_senses","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"human_organs_senses","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"humaneval","suite":["helm","code_scenario"],"prompt_function":"humaneval","hf_repo":"openai_humaneval","hf_subset":"openai_humaneval","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":600,"metric":["code_humaneval"],"stop_sequence":["\nclass","\ndef","\nif","\nprint"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"hyperbaton","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"hyperbaton","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"identify_math_theorems","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"identify_math_theorems","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"identify_odd_metaphor","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"identify_odd_metaphor","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"imdb","suite":["helm","helm_general"],"prompt_function":"imdb","hf_repo":"lighteval\/IMDB_helm","hf_subset":"default","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"imdb:contrastset","suite":["helm"],"prompt_function":"imdb_contrastset","hf_repo":"lighteval\/IMDB_helm","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"implicatures","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"implicatures","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"implicit_relations","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"implicit_relations","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"intent_recognition","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"intent_recognition","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"interactive_qa_mmlu:abstract_algebra","suite":["helm","interactive_qa_mmlu_scenario"],"prompt_function":"mmlu_qa_abstract_algebra","hf_repo":"lighteval\/mmlu","hf_subset":"abstract_algebra","hf_avail_splits":["dev","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"interactive_qa_mmlu:college_chemistry","suite":["helm","interactive_qa_mmlu_scenario"],"prompt_function":"mmlu_qa_college_chemistry","hf_repo":"lighteval\/mmlu","hf_subset":"college_chemistry","hf_avail_splits":["dev","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"interactive_qa_mmlu:global_facts","suite":["helm","interactive_qa_mmlu_scenario"],"prompt_function":"mmlu_qa_global_facts","hf_repo":"lighteval\/mmlu","hf_subset":"global_facts","hf_avail_splits":["dev","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"interactive_qa_mmlu:miscellaneous","suite":["helm","interactive_qa_mmlu_scenario"],"prompt_function":"mmlu_qa_miscellaneous","hf_repo":"lighteval\/mmlu","hf_subset":"miscellaneous","hf_avail_splits":["dev","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"interactive_qa_mmlu:nutrition","suite":["helm","interactive_qa_mmlu_scenario"],"prompt_function":"mmlu_qa_nutrition","hf_repo":"lighteval\/mmlu","hf_subset":"nutrition","hf_avail_splits":["dev","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"interactive_qa_mmlu:us_foreign_policy","suite":["helm","interactive_qa_mmlu_scenario"],"prompt_function":"mmlu_qa_us_foreign_policy","hf_repo":"lighteval\/mmlu","hf_subset":"us_foreign_policy","hf_avail_splits":["dev","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"international_phonetic_alphabet_nli","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"international_phonetic_alphabet_nli","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"international_phonetic_alphabet_transliterate","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"international_phonetic_alphabet_transliterate","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"intersect_geometry","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"intersect_geometry","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"irony_identification","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"irony_identification","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"iwslt17:ar-en","suite":["lighteval","harness_selection"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_ar-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"iwslt17:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"iwslt17:en-ar","suite":["lighteval","harness_selection"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_ar-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"iwslt17:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"iwslt17:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"iwslt17:en-ja","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_en-ja","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"iwslt17:en-ko","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_en-ko","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"iwslt17:en-zh","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_en-zh","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"iwslt17:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"iwslt17:ja-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_ja-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"iwslt17:ko-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_ko-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"iwslt17:zh-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_zh-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"kanji_ascii","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"kanji_ascii","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"kannada","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"kannada","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"key_value_maps","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"key_value_maps","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"known_unknowns","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"known_unknowns","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lambada:standard","suite":["lighteval","lambada"],"prompt_function":"lambada","hf_repo":"lambada","hf_subset":"plain_text","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lambada:standard_cloze","suite":["lighteval","lambada"],"prompt_function":"lambada_cloze","hf_repo":"lambada","hf_subset":"plain_text","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lambada:openai","suite":["lighteval","lambada"],"prompt_function":"lambada","hf_repo":"EleutherAI\/lambada_openai","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lambada:openai:de","suite":["lighteval","lambada"],"prompt_function":"lambada","hf_repo":"EleutherAI\/lambada_openai","hf_subset":"de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"lambada:openai:en","suite":["lighteval","lambada"],"prompt_function":"lambada","hf_repo":"EleutherAI\/lambada_openai","hf_subset":"en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lambada:openai:es","suite":["lighteval","lambada"],"prompt_function":"lambada","hf_repo":"EleutherAI\/lambada_openai","hf_subset":"es","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lambada:openai:fr","suite":["lighteval","lambada"],"prompt_function":"lambada","hf_repo":"EleutherAI\/lambada_openai","hf_subset":"fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lambada:openai:it","suite":["lighteval","lambada"],"prompt_function":"lambada","hf_repo":"EleutherAI\/lambada_openai","hf_subset":"it","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lambada:openai_cloze","suite":["lighteval","lambada"],"prompt_function":"lambada_cloze","hf_repo":"EleutherAI\/lambada_openai","hf_subset":"en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"language_games","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"language_games","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"language_identification","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"language_identification","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"legal_summarization:billsum","suite":["helm"],"prompt_function":"legal_summarization","hf_repo":"lighteval\/legal_summarization","hf_subset":"BillSum","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1024,"metric":["rouge1","rouge2","rougeL","faithfulness","extractiveness","bert_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"legal_summarization:eurlexsum","suite":["helm"],"prompt_function":"legal_summarization","hf_repo":"lighteval\/legal_summarization","hf_subset":"EurLexSum","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["rouge1","rouge2","rougeL","faithfulness","extractiveness","bert_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"legal_summarization:multilexsum","suite":["helm"],"prompt_function":"multilexsum","hf_repo":"lighteval\/legal_summarization","hf_subset":"MultiLexSum","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":256,"metric":["rouge1","rouge2","rougeL","faithfulness","extractiveness","bert_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"legalsupport","suite":["helm"],"prompt_function":"legal_support","hf_repo":"lighteval\/LegalSupport","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["loglikelihood_acc","exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lexglue:case_hold","suite":["helm","lex_glue_scenario"],"prompt_function":"lex_glue_case_hold","hf_repo":"lighteval\/lexglue","hf_subset":"case_hold","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lexglue:ecthr_a","suite":["helm","lex_glue_scenario"],"prompt_function":"lex_glue_ecthr_a","hf_repo":"lighteval\/lexglue","hf_subset":"ecthr_a","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lexglue:ecthr_b","suite":["helm","lex_glue_scenario"],"prompt_function":"lex_glue_ecthr_b","hf_repo":"lighteval\/lexglue","hf_subset":"ecthr_b","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lexglue:eurlex","suite":["helm","lex_glue_scenario"],"prompt_function":"lex_glue_eurlex","hf_repo":"lighteval\/lexglue","hf_subset":"eurlex","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"lexglue:ledgar","suite":["helm","lex_glue_scenario"],"prompt_function":"lex_glue_ledgar","hf_repo":"lighteval\/lexglue","hf_subset":"ledgar","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lexglue:scotus","suite":["helm","lex_glue_scenario"],"prompt_function":"lex_glue_scotus","hf_repo":"lighteval\/lexglue","hf_subset":"scotus","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lexglue:unfair_tos","suite":["helm","lex_glue_scenario"],"prompt_function":"lex_glue_unfair_tos","hf_repo":"lighteval\/lexglue","hf_subset":"unfair_tos","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lextreme:brazilian_court_decisions_judgment","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_brazilian_court_decisions_judgment","hf_repo":"lighteval\/lextreme","hf_subset":"brazilian_court_decisions_judgment","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lextreme:brazilian_court_decisions_unanimity","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_brazilian_court_decisions_unanimity","hf_repo":"lighteval\/lextreme","hf_subset":"brazilian_court_decisions_unanimity","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lextreme:covid19_emergency_event","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_covid19_emergency_event","hf_repo":"lighteval\/lextreme","hf_subset":"covid19_emergency_event","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"lextreme:german_argument_mining","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_german_argument_mining","hf_repo":"lighteval\/lextreme","hf_subset":"german_argument_mining","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lextreme:greek_legal_code_chapter","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_greek_legal_code_chapter","hf_repo":"lighteval\/lextreme","hf_subset":"greek_legal_code_chapter","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lextreme:greek_legal_code_subject","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_greek_legal_code_subject","hf_repo":"lighteval\/lextreme","hf_subset":"greek_legal_code_subject","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lextreme:greek_legal_code_volume","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_greek_legal_code_volume","hf_repo":"lighteval\/lextreme","hf_subset":"greek_legal_code_volume","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lextreme:greek_legal_ner","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_greek_legal_ner","hf_repo":"lighteval\/lextreme","hf_subset":"greek_legal_ner","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":430,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lextreme:legalnero","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_legalnero","hf_repo":"lighteval\/lextreme","hf_subset":"legalnero","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":788,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"lextreme:lener_br","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_lener_br","hf_repo":"lighteval\/lextreme","hf_subset":"lener_br","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":338,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lextreme:mapa_coarse","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_mapa_coarse","hf_repo":"lighteval\/lextreme","hf_subset":"mapa_coarse","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":274,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lextreme:mapa_fine","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_mapa_fine","hf_repo":"lighteval\/lextreme","hf_subset":"mapa_fine","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":274,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lextreme:multi_eurlex_level_1","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_multi_eurlex_level_1","hf_repo":"lighteval\/lextreme","hf_subset":"multi_eurlex_level_1","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lextreme:multi_eurlex_level_2","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_multi_eurlex_level_2","hf_repo":"lighteval\/lextreme","hf_subset":"multi_eurlex_level_2","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lextreme:multi_eurlex_level_3","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_multi_eurlex_level_3","hf_repo":"lighteval\/lextreme","hf_subset":"multi_eurlex_level_3","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"lextreme:online_terms_of_service_clause_topics","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_online_terms_of_service_clause_topics","hf_repo":"lighteval\/lextreme","hf_subset":"online_terms_of_service_clause_topics","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lextreme:online_terms_of_service_unfairness_levels","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_online_terms_of_service_unfairness_levels","hf_repo":"lighteval\/lextreme","hf_subset":"online_terms_of_service_unfairness_levels","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lextreme:swiss_judgment_prediction","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_swiss_judgment_prediction","hf_repo":"lighteval\/lextreme","hf_subset":"swiss_judgment_prediction","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"linguistic_mappings","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"linguistic_mappings","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"linguistics_puzzles","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"linguistics_puzzles","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["bleu","rouge_t5","perfect_exact_match"],"stop_sequence":null,"output_regex":"[^\\.\\?\\!\\;\\n]+", "trust_dataset": true,"version":0} -{"name":"logic_grid_puzzle","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"logic_grid_puzzle","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"logical_args","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"logical_args","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"logical_deduction","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"logical_deduction","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"logical_fallacy_detection","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"logical_fallacy_detection","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"logical_sequence","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"logical_sequence","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"logiqa","suite":["lighteval"],"prompt_function":"logiqa","hf_repo":"lighteval/logiqa_harness","hf_subset":"logiqa","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lsat_qa","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"all","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lsat_qa:assignment","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"assignment","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lsat_qa:grouping","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"grouping","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"lsat_qa:miscellaneous","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"miscellaneous","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lsat_qa:ordering","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"ordering","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"math:algebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"algebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":1} -{"name":"math:counting_and_probability","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"counting_and_probability","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":1} -{"name":"math:geometry","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"geometry","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":1} -{"name":"math:intermediate_algebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"intermediate_algebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":1} -{"name":"math:number_theory","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"number_theory","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":1} 
-{"name":"math:prealgebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"prealgebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":1} -{"name":"math:precalculus","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"precalculus","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":1} -{"name":"math_cot:algebra","suite":["lighteval","math"],"prompt_function":"math_cot","hf_repo":"lighteval\/MATH","hf_subset":"algebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"math_cot:counting_and_probability","suite":["lighteval","math"],"prompt_function":"math_cot","hf_repo":"lighteval\/MATH","hf_subset":"counting_and_probability","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"math_cot:geometry","suite":["lighteval","math"],"prompt_function":"math_cot","hf_repo":"lighteval\/MATH","hf_subset":"geometry","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"math_cot:intermediate_algebra","suite":["lighteval","math"],"prompt_function":"math_cot","hf_repo":"lighteval\/MATH","hf_subset":"intermediate_algebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"math_cot:number_theory","suite":["lighteval","math"],"prompt_function":"math_cot","hf_repo":"lighteval\/MATH","hf_subset":"number_theory","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"math_cot:prealgebra","suite":["lighteval","math"],"prompt_function":"math_cot","hf_repo":"lighteval\/MATH","hf_subset":"prealgebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"math_cot:precalculus","suite":["lighteval","math"],"prompt_function":"math_cot","hf_repo":"lighteval\/MATH","hf_subset":"precalculus","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mathematical_induction","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"mathematical_induction","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mathqa","suite":["lighteval"],"prompt_function":"mathqa","hf_repo":"math_qa","hf_subset":"default","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"matrixshapes","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"matrixshapes","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"me_q_sum","suite":["helm"],"prompt_function":"me_q_sum","hf_repo":"lighteval\/me_q_sum","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":128,"metric":["exact_match","quasi_exact_match","f1_score","rougeL","bleu_1","bleu_4"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"med_dialog:healthcaremagic","suite":["helm"],"prompt_function":"med_dialog","hf_repo":"lighteval\/med_dialog","hf_subset":"healthcaremagic","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":128,"metric":["exact_match","quasi_exact_match","f1_score","rougeL","bleu_1","bleu_4"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"med_dialog:icliniq","suite":["helm"],"prompt_function":"med_dialog","hf_repo":"lighteval\/med_dialog","hf_subset":"icliniq","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":128,"metric":["exact_match","quasi_exact_match","f1_score","rougeL","bleu_1","bleu_4"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"med_mcqa","suite":["helm"],"prompt_function":"med_mcqa","hf_repo":"lighteval\/med_mcqa","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["loglikelihood_acc","exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, 
"trust_dataset": true,"version":0} -{"name":"med_paragraph_simplification","suite":["helm"],"prompt_function":"med_paragraph_simplification","hf_repo":"lighteval\/med_paragraph_simplification","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":512,"metric":["exact_match","quasi_exact_match","f1_score","rougeL","bleu_1","bleu_4"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"med_qa","suite":["helm"],"prompt_function":"med_qa","hf_repo":"bigbio\/med_qa","hf_subset":"med_qa_en_source","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["loglikelihood_acc","exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"metaphor_boolean","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"metaphor_boolean","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"metaphor_understanding","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"metaphor_understanding","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mgsm:en","suite":["lighteval"],"prompt_function":"mgsm_en","hf_repo":"juletxara/mgsm","hf_subset":"en","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "Question:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mgsm:es","suite":["lighteval"],"prompt_function":"mgsm_es","hf_repo":"juletxara/mgsm","hf_subset":"es","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "Pregunta:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mgsm:fr","suite":["lighteval"],"prompt_function":"mgsm_fr","hf_repo":"juletxara/mgsm","hf_subset":"fr","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "Question:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mgsm:de","suite":["lighteval"],"prompt_function":"mgsm_de","hf_repo":"juletxara/mgsm","hf_subset":"de","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "Frage:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mgsm:ru","suite":["lighteval"],"prompt_function":"mgsm_ru","hf_repo":"juletxara/mgsm","hf_subset":"ru","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "\u0417\u0430\u0434\u0430\u0447\u0430:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mgsm:zh","suite":["lighteval"],"prompt_function":"mgsm_zh","hf_repo":"juletxara/mgsm","hf_subset":"zh","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "\u95ee\u9898:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mgsm:ja","suite":["lighteval"],"prompt_function":"mgsm_ja","hf_repo":"juletxara/mgsm","hf_subset":"ja","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "\u554f\u984c:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mgsm:th","suite":["lighteval"],"prompt_function":"mgsm_th","hf_repo":"juletxara/mgsm","hf_subset":"th","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "\u0e42\u0e08\u0e17\u0e22\u0e4c:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mgsm:sw","suite":["lighteval"],"prompt_function":"mgsm_sw","hf_repo":"juletxara/mgsm","hf_subset":"sw","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "Swali:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mgsm:bn","suite":["lighteval"],"prompt_function":"mgsm_bn","hf_repo":"juletxara/mgsm","hf_subset":"bn","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "\u09aa\u09cd\u09b0\u09b6\u09cd\u09a8:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mgsm:te","suite":["lighteval"],"prompt_function":"mgsm_te","hf_repo":"juletxara/mgsm","hf_subset":"te","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "\u0c2a\u0c4d\u0c30\u0c36\u0c4d\u0c28:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"minute_mysteries_qa","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"minute_mysteries_qa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","rouge_t5"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"misconceptions","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"misconceptions","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"misconceptions_russian","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"misconceptions_russian","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"all","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu","suite":["original"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"all","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":5,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:abstract_algebra","suite":["original","mmlu"],"prompt_function":"mmlu_abstract_algebra","hf_repo":"cais\/mmlu","hf_subset":"abstract_algebra","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:abstract_algebra","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"abstract_algebra","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:abstract_algebra","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"abstract_algebra","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:anatomy","suite":["original","mmlu"],"prompt_function":"mmlu_anatomy","hf_repo":"cais\/mmlu","hf_subset":"anatomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:anatomy","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"anatomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:anatomy","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"anatomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:astronomy","suite":["original","mmlu"],"prompt_function":"mmlu_astronomy","hf_repo":"cais\/mmlu","hf_subset":"astronomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:astronomy","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"astronomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:astronomy","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"astronomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:business_ethics","suite":["original","mmlu"],"prompt_function":"mmlu_business_ethics","hf_repo":"cais\/mmlu","hf_subset":"business_ethics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:business_ethics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"business_ethics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:business_ethics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"business_ethics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:clinical_knowledge","suite":["original","mmlu"],"prompt_function":"mmlu_clinical_knowledge","hf_repo":"cais\/mmlu","hf_subset":"clinical_knowledge","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:clinical_knowledge","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"clinical_knowledge","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:clinical_knowledge","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"clinical_knowledge","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:college_biology","suite":["original","mmlu"],"prompt_function":"mmlu_college_biology","hf_repo":"cais\/mmlu","hf_subset":"college_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:college_biology","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:college_biology","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"college_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:college_chemistry","suite":["original","mmlu"],"prompt_function":"mmlu_college_chemistry","hf_repo":"cais\/mmlu","hf_subset":"college_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:college_chemistry","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:college_chemistry","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"college_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:college_computer_science","suite":["original","mmlu"],"prompt_function":"mmlu_college_computer_science","hf_repo":"cais\/mmlu","hf_subset":"college_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:college_computer_science","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:college_computer_science","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"college_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:college_mathematics","suite":["original","mmlu"],"prompt_function":"mmlu_college_mathematics","hf_repo":"cais\/mmlu","hf_subset":"college_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:college_mathematics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:college_mathematics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"college_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:college_medicine","suite":["original","mmlu"],"prompt_function":"mmlu_college_medicine","hf_repo":"cais\/mmlu","hf_subset":"college_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:college_medicine","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:college_medicine","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"college_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:college_physics","suite":["original","mmlu"],"prompt_function":"mmlu_college_physics","hf_repo":"cais\/mmlu","hf_subset":"college_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:college_physics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:college_physics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"college_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:computer_security","suite":["original","mmlu"],"prompt_function":"mmlu_computer_security","hf_repo":"cais\/mmlu","hf_subset":"computer_security","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:computer_security","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"computer_security","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:computer_security","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"computer_security","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:conceptual_physics","suite":["original","mmlu"],"prompt_function":"mmlu_conceptual_physics","hf_repo":"cais\/mmlu","hf_subset":"conceptual_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:conceptual_physics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"conceptual_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:conceptual_physics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"conceptual_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:econometrics","suite":["original","mmlu"],"prompt_function":"mmlu_econometrics","hf_repo":"cais\/mmlu","hf_subset":"econometrics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:econometrics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"econometrics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:econometrics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"econometrics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:electrical_engineering","suite":["original","mmlu"],"prompt_function":"mmlu_electrical_engineering","hf_repo":"cais\/mmlu","hf_subset":"electrical_engineering","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:electrical_engineering","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"electrical_engineering","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:electrical_engineering","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"electrical_engineering","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:elementary_mathematics","suite":["original","mmlu"],"prompt_function":"mmlu_elementary_mathematics","hf_repo":"cais\/mmlu","hf_subset":"elementary_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:elementary_mathematics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"elementary_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:elementary_mathematics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"elementary_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:formal_logic","suite":["original","mmlu"],"prompt_function":"mmlu_formal_logic","hf_repo":"cais\/mmlu","hf_subset":"formal_logic","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:formal_logic","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"formal_logic","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:formal_logic","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"formal_logic","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:global_facts","suite":["original","mmlu"],"prompt_function":"mmlu_global_facts","hf_repo":"cais\/mmlu","hf_subset":"global_facts","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:global_facts","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"global_facts","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:global_facts","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"global_facts","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_biology","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_biology","hf_repo":"cais\/mmlu","hf_subset":"high_school_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_biology","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_biology","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_chemistry","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_chemistry","hf_repo":"cais\/mmlu","hf_subset":"high_school_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_chemistry","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:high_school_chemistry","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_computer_science","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_computer_science","hf_repo":"cais\/mmlu","hf_subset":"high_school_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_computer_science","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_computer_science","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_european_history","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_european_history","hf_repo":"cais\/mmlu","hf_subset":"high_school_european_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_european_history","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_european_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_european_history","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_european_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:high_school_geography","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_geography","hf_repo":"cais\/mmlu","hf_subset":"high_school_geography","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_geography","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_geography","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_geography","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_geography","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_government_and_politics","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_government_and_politics","hf_repo":"cais\/mmlu","hf_subset":"high_school_government_and_politics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_government_and_politics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_government_and_politics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_government_and_politics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_government_and_politics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_macroeconomics","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_macroeconomics","hf_repo":"cais\/mmlu","hf_subset":"high_school_macroeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:high_school_macroeconomics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_macroeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_macroeconomics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_macroeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_mathematics","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_mathematics","hf_repo":"cais\/mmlu","hf_subset":"high_school_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_mathematics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_mathematics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_microeconomics","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_microeconomics","hf_repo":"cais\/mmlu","hf_subset":"high_school_microeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_microeconomics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_microeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:high_school_microeconomics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_microeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_physics","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_physics","hf_repo":"cais\/mmlu","hf_subset":"high_school_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_physics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_physics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_psychology","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_psychology","hf_repo":"cais\/mmlu","hf_subset":"high_school_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_psychology","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_psychology","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:high_school_statistics","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_statistics","hf_repo":"cais\/mmlu","hf_subset":"high_school_statistics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_statistics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_statistics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_statistics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_statistics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_us_history","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_us_history","hf_repo":"cais\/mmlu","hf_subset":"high_school_us_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_us_history","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_us_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_us_history","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_us_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_world_history","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_world_history","hf_repo":"cais\/mmlu","hf_subset":"high_school_world_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:high_school_world_history","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_world_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_world_history","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_world_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:human_aging","suite":["original","mmlu"],"prompt_function":"mmlu_human_aging","hf_repo":"cais\/mmlu","hf_subset":"human_aging","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:human_aging","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"human_aging","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:human_aging","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"human_aging","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:human_sexuality","suite":["original","mmlu"],"prompt_function":"mmlu_human_sexuality","hf_repo":"cais\/mmlu","hf_subset":"human_sexuality","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:human_sexuality","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"human_sexuality","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:human_sexuality","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"human_sexuality","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:international_law","suite":["original","mmlu"],"prompt_function":"mmlu_international_law","hf_repo":"cais\/mmlu","hf_subset":"international_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:international_law","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"international_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:international_law","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"international_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:jurisprudence","suite":["original","mmlu"],"prompt_function":"mmlu_jurisprudence","hf_repo":"cais\/mmlu","hf_subset":"jurisprudence","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:jurisprudence","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"jurisprudence","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:jurisprudence","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"jurisprudence","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:logical_fallacies","suite":["original","mmlu"],"prompt_function":"mmlu_logical_fallacies","hf_repo":"cais\/mmlu","hf_subset":"logical_fallacies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:logical_fallacies","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"logical_fallacies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:logical_fallacies","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"logical_fallacies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:machine_learning","suite":["original","mmlu"],"prompt_function":"mmlu_machine_learning","hf_repo":"cais\/mmlu","hf_subset":"machine_learning","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:machine_learning","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"machine_learning","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:machine_learning","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"machine_learning","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:management","suite":["original","mmlu"],"prompt_function":"mmlu_management","hf_repo":"cais\/mmlu","hf_subset":"management","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:management","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"management","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:management","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"management","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:marketing","suite":["original","mmlu"],"prompt_function":"mmlu_marketing","hf_repo":"cais\/mmlu","hf_subset":"marketing","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:marketing","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"marketing","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:marketing","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"marketing","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:medical_genetics","suite":["original","mmlu"],"prompt_function":"mmlu_medical_genetics","hf_repo":"cais\/mmlu","hf_subset":"medical_genetics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:medical_genetics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"medical_genetics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:medical_genetics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"medical_genetics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:miscellaneous","suite":["original","mmlu"],"prompt_function":"mmlu_miscellaneous","hf_repo":"cais\/mmlu","hf_subset":"miscellaneous","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:miscellaneous","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"miscellaneous","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:miscellaneous","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"miscellaneous","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:moral_disputes","suite":["original","mmlu"],"prompt_function":"mmlu_moral_disputes","hf_repo":"cais\/mmlu","hf_subset":"moral_disputes","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:moral_disputes","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"moral_disputes","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:moral_disputes","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"moral_disputes","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:moral_scenarios","suite":["original","mmlu"],"prompt_function":"mmlu_moral_scenarios","hf_repo":"cais\/mmlu","hf_subset":"moral_scenarios","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:moral_scenarios","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"moral_scenarios","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:moral_scenarios","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"moral_scenarios","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:nutrition","suite":["original","mmlu"],"prompt_function":"mmlu_nutrition","hf_repo":"cais\/mmlu","hf_subset":"nutrition","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:nutrition","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"nutrition","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:nutrition","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"nutrition","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:philosophy","suite":["original","mmlu"],"prompt_function":"mmlu_philosophy","hf_repo":"cais\/mmlu","hf_subset":"philosophy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:philosophy","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"philosophy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:philosophy","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"philosophy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:prehistory","suite":["original","mmlu"],"prompt_function":"mmlu_prehistory","hf_repo":"cais\/mmlu","hf_subset":"prehistory","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:prehistory","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"prehistory","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:prehistory","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"prehistory","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:professional_accounting","suite":["original","mmlu"],"prompt_function":"mmlu_professional_accounting","hf_repo":"cais\/mmlu","hf_subset":"professional_accounting","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:professional_accounting","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"professional_accounting","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:professional_accounting","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"professional_accounting","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:professional_law","suite":["original","mmlu"],"prompt_function":"mmlu_professional_law","hf_repo":"cais\/mmlu","hf_subset":"professional_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:professional_law","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"professional_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:professional_law","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"professional_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:professional_medicine","suite":["original","mmlu"],"prompt_function":"mmlu_professional_medicine","hf_repo":"cais\/mmlu","hf_subset":"professional_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:professional_medicine","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"professional_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:professional_medicine","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"professional_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:professional_psychology","suite":["original","mmlu"],"prompt_function":"mmlu_professional_psychology","hf_repo":"cais\/mmlu","hf_subset":"professional_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:professional_psychology","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"professional_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:professional_psychology","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"professional_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:public_relations","suite":["original","mmlu"],"prompt_function":"mmlu_public_relations","hf_repo":"cais\/mmlu","hf_subset":"public_relations","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:public_relations","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"public_relations","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:public_relations","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"public_relations","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:security_studies","suite":["original","mmlu"],"prompt_function":"mmlu_security_studies","hf_repo":"cais\/mmlu","hf_subset":"security_studies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:security_studies","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"security_studies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:security_studies","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"security_studies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:sociology","suite":["original","mmlu"],"prompt_function":"mmlu_sociology","hf_repo":"cais\/mmlu","hf_subset":"sociology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:sociology","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"sociology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:sociology","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"sociology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:us_foreign_policy","suite":["original","mmlu"],"prompt_function":"mmlu_us_foreign_policy","hf_repo":"cais\/mmlu","hf_subset":"us_foreign_policy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:us_foreign_policy","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"us_foreign_policy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:us_foreign_policy","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"us_foreign_policy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:virology","suite":["original","mmlu"],"prompt_function":"mmlu_virology","hf_repo":"cais\/mmlu","hf_subset":"virology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:virology","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"virology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:virology","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"virology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:world_religions","suite":["original","mmlu"],"prompt_function":"mmlu_world_religions","hf_repo":"cais\/mmlu","hf_subset":"world_religions","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:world_religions","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"world_religions","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:world_religions","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"world_religions","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mnist_ascii","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"mnist_ascii","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"modified_arithmetic","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"modified_arithmetic","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"moral_permissibility","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"moral_permissibility","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"movie_dialog_same_or_different","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"movie_dialog_same_or_different","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"movie_recommendation","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"movie_recommendation","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mtnt2019:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"mtnt2019_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":200,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mtnt2019:en-ja","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"mtnt2019_en-ja","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":200,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mtnt2019:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"mtnt2019_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":200,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mtnt2019:ja-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"mtnt2019_ja-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":200,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mult_data_wrangling","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"mult_data_wrangling","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"multiemo","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"multiemo","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mutual","suite":["lighteval"],"prompt_function":"mutual","hf_repo":"lighteval\/mutual_harness","hf_subset":"mutual","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["recall_at_1","recall_at_2","mrr"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mutual_plus","suite":["lighteval"],"prompt_function":"mutual","hf_repo":"lighteval\/mutual_harness","hf_subset":"mutual_plus","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["recall_at_1","recall_at_2","mrr"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"narrativeqa","suite":["helm","helm_general"],"prompt_function":"narrativeqa","hf_repo":"lighteval/narrative_qa_helm","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","f1_score","rougeL","bleu_1","bleu_4"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"natural_instructions","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"natural_instructions","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"navigate","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"navigate","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"nonsense_words_grammar","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"nonsense_words_grammar","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"novel_concepts","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"novel_concepts","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"numeracy:linear_example","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"linear_example","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"numeracy:linear_standard","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"linear_standard","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"numeracy:parabola_example","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"parabola_example","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"numeracy:parabola_standard","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"parabola_standard","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"numeracy:paraboloid_example","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"paraboloid_example","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"numeracy:paraboloid_standard","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"paraboloid_standard","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"numeracy:plane_example","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"plane_example","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"numeracy:plane_standard","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"plane_standard","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"object_counting","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"object_counting","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"odd_one_out","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"odd_one_out","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"openbookqa","suite":["helm","commonsense_scenario","helm_general"],"prompt_function":"openbookqa_helm","hf_repo":"openbookqa","hf_subset":"main","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"openbookqa","suite":["lighteval"],"prompt_function":"openbookqa","hf_repo":"openbookqa","hf_subset":"main","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"operators","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"operators","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":"([-+]?\\d+)[.]{0,1}$", "trust_dataset": true,"version":0} -{"name":"paragraph_segmentation","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"paragraph_segmentation","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"parsinlu_qa","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"parsinlu_qa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"parsinlu_reading_comprehension","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"parsinlu_reading_comprehension","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["perfect_exact_match"],"stop_sequence":null,"output_regex":"[^\\.\\?\\!\\;\\n]+", "trust_dataset": true,"version":0} -{"name":"penguins_in_a_table","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"penguins_in_a_table","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"periodic_elements","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"periodic_elements","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"persian_idioms","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"persian_idioms","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"phrase_relatedness","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"phrase_relatedness","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"physical_intuition","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"physical_intuition","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"physics","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"physics","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"physics_questions","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"physics_questions","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"piqa","suite":["lighteval"],"prompt_function":"piqa_harness","hf_repo":"piqa","hf_subset":"plain_text","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"piqa","suite":["helm","commonsense_scenario"],"prompt_function":"piqa_helm","hf_repo":"piqa","hf_subset":"plain_text","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"play_dialog_same_or_different","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"play_dialog_same_or_different","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"polish_sequence_labeling","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"polish_sequence_labeling","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["f1_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"presuppositions_as_nli","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"presuppositions_as_nli","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"prost","suite":["lighteval"],"prompt_function":"prost","hf_repo":"corypaik\/prost","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"pubmedqa","suite":["lighteval"],"prompt_function":"pubmed_qa","hf_repo":"pubmed_qa","hf_subset":"pqa_labeled","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"pubmedqa","suite":["helm"],"prompt_function":"pubmed_qa_helm","hf_repo":"pubmed_qa","hf_subset":"pqa_labeled","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"qa4mre:2011","suite":["lighteval"],"prompt_function":"qa4mre","hf_repo":"qa4mre","hf_subset":"2011.main.EN","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"qa4mre:2012","suite":["lighteval"],"prompt_function":"qa4mre","hf_repo":"qa4mre","hf_subset":"2012.main.EN","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"qa4mre:2013","suite":["lighteval"],"prompt_function":"qa4mre","hf_repo":"qa4mre","hf_subset":"2013.main.EN","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"qa_wikidata","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"qa_wikidata","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleurt","bleu","rouge_t5","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"qasper","suite":["lighteval"],"prompt_function":"qasper","hf_repo":"qasper","hf_subset":"qasper","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["f1_score_quasi"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"qasper_ll","suite":["lighteval"],"prompt_function":"qasper_ll","hf_repo":"qasper","hf_subset":"qasper","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"quac","suite":["helm"],"prompt_function":"quac","hf_repo":"lighteval/quac_helm","hf_subset":"default","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match", "quasi_exact_match", "f1_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"question_selection","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"question_selection","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"race:high","suite":["lighteval","race"],"prompt_function":"race","hf_repo":"EleutherAI/race","hf_subset":"high","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"raft:ade_corpus_v2","suite":["helm","helm_general"],"prompt_function":"raft_ade_corpus_v2","hf_repo":"ought\/raft","hf_subset":"ade_corpus_v2","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"raft:banking_77","suite":["helm","helm_general"],"prompt_function":"raft_banking_77","hf_repo":"ought\/raft","hf_subset":"banking_77","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"raft:neurips_impact_statement_risks","suite":["helm","helm_general"],"prompt_function":"raft_neurips_impact_statement_risks","hf_repo":"ought\/raft","hf_subset":"neurips_impact_statement_risks","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"raft:one_stop_english","suite":["helm","helm_general"],"prompt_function":"raft_one_stop_english","hf_repo":"ought\/raft","hf_subset":"one_stop_english","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"raft:overruling","suite":["helm","helm_general"],"prompt_function":"raft_overruling","hf_repo":"ought\/raft","hf_subset":"overruling","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"raft:semiconductor_org_types","suite":["helm","helm_general"],"prompt_function":"raft_semiconductor_org_types","hf_repo":"ought\/raft","hf_subset":"semiconductor_org_types","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"raft:systematic_review_inclusion","suite":["helm","helm_general"],"prompt_function":"raft_systematic_review_inclusion","hf_repo":"ought\/raft","hf_subset":"systematic_review_inclusion","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"raft:tai_safety_research","suite":["helm","helm_general"],"prompt_function":"raft_tai_safety_research","hf_repo":"ought\/raft","hf_subset":"tai_safety_research","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"raft:terms_of_service","suite":["helm","helm_general"],"prompt_function":"raft_terms_of_service","hf_repo":"ought\/raft","hf_subset":"terms_of_service","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"raft:tweet_eval_hate","suite":["helm","helm_general"],"prompt_function":"raft_tweet_eval_hate","hf_repo":"ought\/raft","hf_subset":"tweet_eval_hate","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"raft:twitter_complaints","suite":["helm","helm_general"],"prompt_function":"raft_twitter_complaints","hf_repo":"ought\/raft","hf_subset":"twitter_complaints","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"real_or_fake_text","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"real_or_fake_text","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, 
"trust_dataset": true,"version":0} -{"name":"real_toxicity_prompts","suite":["helm"],"prompt_function":"real_toxicity_prompts","hf_repo":"allenai\/real-toxicity-prompts","hf_subset":"default","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["prediction_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"reasoning_about_colored_objects","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"reasoning_about_colored_objects","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"repeat_copy_logic","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"repeat_copy_logic","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"rephrase","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"rephrase","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["rouge_t5","bleu","loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"rhyming","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"rhyming","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"riddle_sense","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"riddle_sense","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"ruin_names","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"ruin_names","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"salient_translation_error_detection","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"salient_translation_error_detection","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"scientific_press_release","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"scientific_press_release","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"sciq","suite":["lighteval"],"prompt_function":"sciq","hf_repo":"sciq","hf_subset":"default","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"semantic_parsing_in_context_sparc","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"semantic_parsing_in_context_sparc","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"semantic_parsing_spider","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"semantic_parsing_spider","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"sentence_ambiguity","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"sentence_ambiguity","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"similarities_abstraction","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"similarities_abstraction","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"simp_turing_concept","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"simp_turing_concept","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"simple_arithmetic_json","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"simple_arithmetic_json","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"simple_arithmetic_json_multiple_choice","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"simple_arithmetic_json_multiple_choice","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"simple_arithmetic_json_subtasks","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"simple_arithmetic_json_subtasks","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"simple_arithmetic_multiple_targets_json","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"simple_arithmetic_multiple_targets_json","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"simple_ethical_questions","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"simple_ethical_questions","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"simple_text_editing","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"simple_text_editing","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"siqa","suite":["helm","commonsense_scenario"],"prompt_function":"siqa","hf_repo":"social_i_qa","hf_subset":"default","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"snarks","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"snarks","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"social_iqa","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"social_iqa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"social_support","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"social_support","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["f1_score_macro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"sports_understanding","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"sports_understanding","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"storycloze:2016","suite":["lighteval","storycloze"],"prompt_function":"storycloze","hf_repo":"story_cloze","hf_subset":"2016","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"storycloze:2018","suite":["lighteval","storycloze"],"prompt_function":"storycloze","hf_repo":"story_cloze","hf_subset":"2018","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"strange_stories","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"strange_stories","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"strategyqa","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"strategyqa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"sufficient_information","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"sufficient_information","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"suicide_risk","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"suicide_risk","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"summarization:cnn-dm","suite":["helm","helm_general"],"prompt_function":"cnn_dm","hf_repo":"lighteval\/summarization","hf_subset":"cnn-dm","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":128,"metric":["rouge1","rouge2","rougeL","faithfulness","extractiveness","bert_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"summarization:xsum","suite":["helm","helm_general"],"prompt_function":"xsum","hf_repo":"lighteval\/summarization","hf_subset":"xsum","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":64,"metric":["rouge1","rouge2","rougeL","faithfulness","extractiveness","bert_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"summarization:xsum-sampled","suite":["helm"],"prompt_function":"xsum","hf_repo":"lighteval\/summarization","hf_subset":"xsum-sampled","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":64,"metric":["rouge1","rouge2","rougeL","faithfulness","extractiveness","bert_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"super_glue:boolq","suite":["lighteval","superglue"],"prompt_function":"boolq_harness","hf_repo":"super_glue","hf_subset":"boolq","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"super_glue:cb","suite":["lighteval","superglue"],"prompt_function":"cb","hf_repo":"super_glue","hf_subset":"cb","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token", "multi_f1_numeric"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"super_glue:copa","suite":["lighteval","superglue"],"prompt_function":"copa","hf_repo":"super_glue","hf_subset":"copa","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"super_glue:rte","suite":["lighteval","superglue"],"prompt_function":"rte","hf_repo":"super_glue","hf_subset":"rte","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"super_glue:multirc","suite":["lighteval","superglue"],"prompt_function":"multirc","hf_repo":"super_glue","hf_subset":"multirc","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"super_glue:wic","suite":["lighteval","superglue"],"prompt_function":"wic","hf_repo":"super_glue","hf_subset":"wic","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"super_glue:wsc","suite":["lighteval","superglue"],"prompt_function":"wsc","hf_repo":"super_glue","hf_subset":"wsc","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"swahili_english_proverbs","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"swahili_english_proverbs","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"swag","suite":["lighteval"],"prompt_function":"swag","hf_repo":"swag","hf_subset":"regular","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"swedish_to_german_proverbs","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"swedish_to_german_proverbs","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"symbol_interpretation","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"symbol_interpretation","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"synthetic_reasoning:induction","suite":["helm"],"prompt_function":"synthetic_reasoning","hf_repo":"lighteval\/synthetic_reasoning","hf_subset":"induction","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":50,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"synthetic_reasoning:natural_easy","suite":["helm"],"prompt_function":"synthetic_reasoning_natural","hf_repo":"lighteval\/synthetic_reasoning_natural","hf_subset":"easy","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","f1_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": 
true,"version":0} -{"name":"synthetic_reasoning:natural_hard","suite":["helm"],"prompt_function":"synthetic_reasoning_natural","hf_repo":"lighteval\/synthetic_reasoning_natural","hf_subset":"hard","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","f1_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"synthetic_reasoning:pattern_match","suite":["helm"],"prompt_function":"synthetic_reasoning","hf_repo":"lighteval\/synthetic_reasoning","hf_subset":"pattern_match","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":50,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"synthetic_reasoning:variable_substitution","suite":["helm"],"prompt_function":"synthetic_reasoning","hf_repo":"lighteval\/synthetic_reasoning","hf_subset":"variable_substitution","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":50,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"tellmewhy","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"tellmewhy","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"temporal_sequences","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"temporal_sequences","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"tense","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"tense","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:arxiv","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_arxiv","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"the_pile:arxiv","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"arxiv","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:bibliotik","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"bibliotik","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:bookcorpus2","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_bookcorpus2","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:books3","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_books3","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:commoncrawl","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"commoncrawl","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:dm-mathematics","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_dm-mathematics","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:dm-mathematics","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"dm-mathematics","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:enron","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_enron","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"the_pile:enron","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"enron","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:europarl","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_europarl","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:europarl","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"europarl","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:freelaw","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_freelaw","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:freelaw","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"freelaw","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:github","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_github","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:github","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"github","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:gutenberg","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_gutenberg","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"the_pile:gutenberg","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"gutenberg","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:hackernews","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_hackernews","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:hackernews","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"hackernews","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:nih-exporter","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_nih-exporter","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:nih-exporter","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"nih-exporter","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:opensubtitles","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_opensubtitles","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:opensubtitles","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"opensubtitles","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:openwebtext2","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_openwebtext2","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"the_pile:openwebtext2","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"openwebtext2","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:philpapers","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_philpapers","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:pile-cc","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_pile-cc","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:pubmed-abstracts","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_pubmed-abstracts","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:pubmed-abstracts","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"pubmed-abstracts","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:pubmed-central","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_pubmed-central","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:pubmed-central","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"pubmed-central","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:stackexchange","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_stackexchange","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"the_pile:stackexchange","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"stackexchange","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:ubuntu-irc","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_ubuntu-irc","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:uspto","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_upsto","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:upsto","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"uspto","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:wikipedia","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_wikipedia","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:wikipedia","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"wikipedia","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:youtubesubtitles","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_youtubesubtitles","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:youtubesubtitles","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"youtubesubtitles","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"timedial","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"timedial","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"toxigen","suite":["lighteval"],"prompt_function":"toxigen","hf_repo":"skg/toxigen-data","hf_subset":"annotated","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"topical_chat","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"topical_chat","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","loglikelihood_acc","bleurt"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"tracking_shuffled_objects","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"tracking_shuffled_objects","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"triviaqa","suite":["lighteval"],"prompt_function":"triviaqa","hf_repo":"trivia_qa","hf_subset":"rc.nocontext","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["quasi_exact_match_triviaqa"],"stop_sequence":["\n", ".", ","],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"truthfulqa:gen","suite":["lighteval"],"prompt_function":"truthful_qa_generative","hf_repo":"truthful_qa","hf_subset":"generation","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":200,"metric":["bleu","rouge_t5"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"truthfulqa:mc","suite":["leaderboard"],"prompt_function":"truthful_qa_multiple_choice","hf_repo":"truthful_qa","hf_subset":"multiple_choice","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["truthfulqa_mc_metrics"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"truthfulqa","suite":["helm","helm_general"],"prompt_function":"truthful_qa_helm","hf_repo":"lighteval\/truthfulqa_helm","hf_subset":"default","hf_avail_splits":["train","valid"],"evaluation_splits":["valid"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["loglikelihood_acc","exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"twitterAAE:aa","suite":["helm"],"prompt_function":"twitter_aae","hf_repo":"lighteval\/twitterAAE","hf_subset":"aa","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"twitterAAE:white","suite":["helm"],"prompt_function":"twitter_aae","hf_repo":"lighteval\/twitterAAE","hf_subset":"white","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"understanding_fables","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"understanding_fables","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"undo_permutation","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"undo_permutation","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"unit_conversion","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"unit_conversion","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"unit_interpretation","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"unit_interpretation","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"unnatural_in_context_learning","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"unnatural_in_context_learning","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"unscramble:anagrams1","suite":["lighteval","unscramble"],"prompt_function":"unscramble","hf_repo":"lighteval\/GPT3_unscramble","hf_subset":"default","hf_avail_splits":["mid_word_1_anagrams"],"evaluation_splits":["mid_word_1_anagrams"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"unscramble:anagrams2","suite":["lighteval","unscramble"],"prompt_function":"unscramble","hf_repo":"lighteval\/GPT3_unscramble","hf_subset":"default","hf_avail_splits":["mid_word_2_anagrams"],"evaluation_splits":["mid_word_2_anagrams"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"unscramble:cycle_letters","suite":["lighteval","unscramble"],"prompt_function":"unscramble","hf_repo":"lighteval\/GPT3_unscramble","hf_subset":"default","hf_avail_splits":["cycle_letters_in_word"],"evaluation_splits":["cycle_letters_in_word"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"unscramble:random_insertion","suite":["lighteval","unscramble"],"prompt_function":"unscramble","hf_repo":"lighteval\/GPT3_unscramble","hf_subset":"default","hf_avail_splits":["random_insertion_in_word"],"evaluation_splits":["random_insertion_in_word"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"unscramble:reversed_words","suite":["lighteval","unscramble"],"prompt_function":"unscramble","hf_repo":"lighteval\/GPT3_unscramble","hf_subset":"default","hf_avail_splits":["reversed_words"],"evaluation_splits":["reversed_words"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"vitaminc_fact_verification","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"vitaminc_fact_verification","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"webqs","suite":["lighteval"],"prompt_function":"webqs","hf_repo":"web_questions","hf_subset":"default","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"what_is_the_tao","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"what_is_the_tao","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"which_wiki_edit","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"which_wiki_edit","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:applies_to_jurisdiction","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"applies_to_jurisdiction","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:atomic_number","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"atomic_number","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:author","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"author","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:award_received","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"award_received","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:basic_form_of_government","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"basic_form_of_government","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:capital","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"capital","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:capital_of","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"capital_of","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:central_bank","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"central_bank","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:composer","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"composer","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:continent","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"continent","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:country","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"country","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:country_of_citizenship","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"country_of_citizenship","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:country_of_origin","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"country_of_origin","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:creator","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"creator","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:currency","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"currency","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:defendant","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"defendant","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:developer","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"developer","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:diplomatic_relation","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"diplomatic_relation","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:director","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"director","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:discoverer_or_inventor","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"discoverer_or_inventor","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:drug_or_therapy_used_for_treatment","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"drug_or_therapy_used_for_treatment","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:educated_at","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"educated_at","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:electron_configuration","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"electron_configuration","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:employer","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"employer","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:field_of_work","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"field_of_work","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:file_extension","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"file_extension","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:genetic_association","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"genetic_association","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:genre","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"genre","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:has_part","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"has_part","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:head_of_government","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"head_of_government","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:head_of_state","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"head_of_state","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:headquarters_location","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"headquarters_location","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:industry","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"industry","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:influenced_by","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"influenced_by","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:instance_of","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"instance_of","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:instrument","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"instrument","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:language_of_work_or_name","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"language_of_work_or_name","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:languages_spoken_written_or_signed","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"languages_spoken_written_or_signed","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:laws_applied","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"laws_applied","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:located_in_the_administrative_territorial_entity","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"located_in_the_administrative_territorial_entity","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:location","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"location","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:location_of_discovery","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"location_of_discovery","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:location_of_formation","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"location_of_formation","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:majority_opinion_by","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"majority_opinion_by","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:manufacturer","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"manufacturer","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:measured_physical_quantity","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"measured_physical_quantity","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:medical_condition_treated","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"medical_condition_treated","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:member_of","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"member_of","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:member_of_political_party","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"member_of_political_party","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:member_of_sports_team","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"member_of_sports_team","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:movement","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"movement","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:named_after","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"named_after","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:native_language","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"native_language","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:number_of_processor_cores","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"number_of_processor_cores","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:occupation","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"occupation","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:office_held_by_head_of_government","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"office_held_by_head_of_government","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:office_held_by_head_of_state","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"office_held_by_head_of_state","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:official_language","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"official_language","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:operating_system","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"operating_system","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:original_language_of_film_or_TV_show","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"original_language_of_film_or_TV_show","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:original_network","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"original_network","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:overrules","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"overrules","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:owned_by","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"owned_by","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:part_of","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"part_of","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:participating_team","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"participating_team","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:place_of_birth","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"place_of_birth","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:place_of_death","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"place_of_death","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:plaintiff","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"plaintiff","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:position_held","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"position_held","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:position_played_on_team","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"position_played_on_team","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:programming_language","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"programming_language","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:recommended_unit_of_measurement","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"recommended_unit_of_measurement","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:record_label","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"record_label","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:religion","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"religion","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:repealed_by","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"repealed_by","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:shares_border_with","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"shares_border_with","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:solved_by","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"solved_by","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:statement_describes","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"statement_describes","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:stock_exchange","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"stock_exchange","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:subclass_of","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"subclass_of","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:subsidiary","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"subsidiary","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:symptoms_and_signs","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"symptoms_and_signs","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:therapeutic_area","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"therapeutic_area","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:time_of_discovery_or_invention","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"time_of_discovery_or_invention","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:twinned_administrative_body","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"twinned_administrative_body","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:work_location","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"work_location","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikitext:2","suite":["lighteval"],"prompt_function":"wikitext","hf_repo":"wikitext","hf_subset":"wikitext-2-raw-v1","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikitext:103:document_level","suite":["harness"],"prompt_function":"wikitext_harness","hf_repo":"EleutherAI\/wikitext_document_level","hf_subset":"wikitext-103-raw-v1","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikitext:103:document_level","suite":["helm"],"prompt_function":"wikitext_helm","hf_repo":"EleutherAI\/wikitext_document_level","hf_subset":"wikitext-103-raw-v1","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wino_x_german","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"wino_x_german","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"winogrande","suite":["leaderboard"],"prompt_function":"winogrande","hf_repo":"winogrande","hf_subset":"winogrande_xl","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"winowhy","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"winowhy","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, 
"trust_dataset": true,"version":0} -{"name":"wmt08:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt08:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt08:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt08:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt08:en-es","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_en-es","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt08:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt08:en-hu","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_en-hu","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt08:es-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_es-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt08:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt08:hu-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_hu-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt09:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt09:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt09:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt09:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt09:en-es","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_en-es","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt09:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt09:en-hu","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_en-hu","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt09:en-it","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_en-it","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt09:es-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_es-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt09:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt09:hu-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_hu-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt09:it-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_it-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt10:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt10:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt10:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt10:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt10:en-es","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_en-es","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt10:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt10:es-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_es-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt10:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt11:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt11:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt11:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt11:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt11:en-es","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_en-es","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt11:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt11:es-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_es-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt11:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt12:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt12:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt12:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt12:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt12:en-es","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_en-es","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt12:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt12:es-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_es-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt12:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt13:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt13:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt13:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt13:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt13:en-es","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_en-es","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt13:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt13:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt13:es-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_es-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt13:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt13:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt14:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt14:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt14:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt14:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt14:en-fr","suite":["lighteval","gpt3_benchmarks"],"prompt_function":"wmt_alphabetical","hf_repo":"wmt14","hf_subset":"fr-en","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt14:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt14:en-hi","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_en-hi","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt14:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt14:fr-en","suite":["lighteval","gpt3_benchmarks"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"wmt14","hf_subset":"fr-en","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt14:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt14:hi-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_hi-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt14:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt14:cs-en","suite":["helm"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/wmt14","hf_subset":"cs-en","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["bleu"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt14:de-en","suite":["helm"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/wmt14","hf_subset":"de-en","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["bleu"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt14:fr-en","suite":["helm"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/wmt14","hf_subset":"fr-en","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["bleu"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt14:hi-en","suite":["helm"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/wmt14","hf_subset":"hi-en","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["bleu"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt14:ru-en","suite":["helm"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/wmt14","hf_subset":"ru-en","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["bleu"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt15:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt15:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt15:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt15:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt15:en-fi","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_en-fi","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt15:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt15:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt15:fi-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_fi-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt15:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt15:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt16:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt16:de-en","suite":["lighteval","gpt3_benchmarks"],"prompt_function":"wmt_alphabetical","hf_repo":"wmt16","hf_subset":"de-en","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt16:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt16:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt16:en-de","suite":["lighteval","gpt3_benchmarks"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"wmt16","hf_subset":"de-en","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt16:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt16:en-fi","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_en-fi","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt16:en-ro","suite":["lighteval","gpt3_benchmarks"],"prompt_function":"wmt_alphabetical","hf_repo":"wmt16","hf_subset":"ro-en","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt16:en-ro","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_en-ro","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt16:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt16:en-tr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_en-tr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt16:fi-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_fi-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt16:ro-en","suite":["lighteval","gpt3_benchmarks"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"wmt16","hf_subset":"ro-en","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt16:ro-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_ro-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt16:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt16:tr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_tr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt17:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt17:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt17:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt17:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt17:en-fi","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_en-fi","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt17:en-lv","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_en-lv","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt17:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt17:en-tr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_en-tr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt17:en-zh","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_en-zh","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt17:fi-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_fi-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt17:lv-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_lv-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt17:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt17:tr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_tr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt17:zh-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_zh-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt18:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt18:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt18:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt18:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt18:en-et","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_en-et","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt18:en-fi","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_en-fi","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt18:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt18:en-tr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_en-tr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt18:en-zh","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_en-zh","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt18:et-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_et-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt18:fi-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_fi-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt18:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt18:tr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_tr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt18:zh-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_zh-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:cs-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_cs-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt19:de-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_de-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:de-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_de-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:en-fi","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-fi","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:en-gu","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-gu","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:en-kk","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-kk","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt19:en-lt","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-lt","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:en-zh","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-zh","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:fi-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_fi-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:fr-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_fr-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:gu-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_gu-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:kk-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_kk-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:lt-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_lt-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt19:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:zh-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_zh-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:de-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_de-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:en-iu","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-iu","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt20:en-ja","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-ja","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:en-km","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-km","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:en-pl","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-pl","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:en-ps","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-ps","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:en-ta","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-ta","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:en-zh","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-zh","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:fr-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_fr-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt20:iu-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_iu-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:ja-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_ja-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:km-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_km-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:pl-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_pl-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:ps-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_ps-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:ta-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_ta-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:zh-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_zh-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"word_sorting","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"word_sorting","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"word_unscrambling","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"word_unscrambling","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wsc273","suite":["lighteval"],"prompt_function":"wsc273","hf_repo":"winograd_wsc","hf_subset":"wsc273","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xcopa:en","suite":["lighteval"],"prompt_function":"xcopa_en","hf_repo":"xcopa","hf_subset":"default","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xcopa:et","suite":["lighteval"],"prompt_function":"xcopa_et","hf_repo":"xcopa","hf_subset":"et","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xcopa:ht","suite":["lighteval"],"prompt_function":"xcopa_ht","hf_repo":"xcopa","hf_subset":"ht","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xcopa:it","suite":["lighteval"],"prompt_function":"xcopa_it","hf_repo":"xcopa","hf_subset":"it","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xcopa:id","suite":["lighteval"],"prompt_function":"xcopa_id","hf_repo":"xcopa","hf_subset":"id","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xcopa:qu","suite":["lighteval"],"prompt_function":"xcopa_qu","hf_repo":"xcopa","hf_subset":"qu","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"xcopa:sw","suite":["lighteval"],"prompt_function":"xcopa_sw","hf_repo":"xcopa","hf_subset":"sw","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xcopa:zh","suite":["lighteval"],"prompt_function":"xcopa_zh","hf_repo":"xcopa","hf_subset":"zh","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xcopa:ta","suite":["lighteval"],"prompt_function":"xcopa_ta","hf_repo":"xcopa","hf_subset":"ta","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xcopa:th","suite":["lighteval"],"prompt_function":"xcopa_th","hf_repo":"xcopa","hf_subset":"th","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xcopa:tr","suite":["lighteval"],"prompt_function":"xcopa_tr","hf_repo":"xcopa","hf_subset":"tr","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xcopa:vi","suite":["lighteval"],"prompt_function":"xcopa_vi","hf_repo":"xcopa","hf_subset":"vi","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xstory_cloze:en","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"en","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xstory_cloze:ru","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"ru","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xstory_cloze:zh","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"zh","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"xstory_cloze:es","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"es","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xstory_cloze:ar","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"ar","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xstory_cloze:hi","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"hi","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xstory_cloze:id","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"id","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xstory_cloze:te","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"te","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xstory_cloze:sw","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"sw","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xstory_cloze:eu","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"eu","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xstory_cloze:my","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"my","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xwinograd:en","suite":["lighteval"],"prompt_function":"winogrande","hf_repo":"Muennighoff/xwinograd","hf_subset":"en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"xwinograd:fr","suite":["lighteval"],"prompt_function":"winogrande","hf_repo":"Muennighoff/xwinograd","hf_subset":"fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xwinograd:jp","suite":["lighteval"],"prompt_function":"winogrande","hf_repo":"Muennighoff/xwinograd","hf_subset":"jp","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xwinograd:pt","suite":["lighteval"],"prompt_function":"winogrande","hf_repo":"Muennighoff/xwinograd","hf_subset":"pt","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xwinograd:ru","suite":["lighteval"],"prompt_function":"winogrande","hf_repo":"Muennighoff/xwinograd","hf_subset":"ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xwinograd:zh","suite":["lighteval"],"prompt_function":"winogrande","hf_repo":"Muennighoff/xwinograd","hf_subset":"zh","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} diff --git a/src/lighteval/utils.py b/src/lighteval/utils.py index 16235785..3e032d1f 100644 --- a/src/lighteval/utils.py +++ b/src/lighteval/utils.py @@ -209,7 +209,7 @@ def is_openai_available() -> bool: def can_load_extended_tasks() -> bool: imports = [] - for package in ["langdetect"]: + for package in ["langdetect", "openai"]: imports.append(importlib.util.find_spec(package)) return all(cur_import is not None for cur_import in imports) From 4651531e4716911f9934b09d6b813fdbe18e7149 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Fourrier?= <22726840+clefourrier@users.noreply.github.com> Date: Tue, 9 Jul 2024 15:29:00 +0200 Subject: [PATCH 25/25] Now only uses functions for prompt definition (#213) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add function prompt assigment * add json casting * fix ruff setting + fmt * replaced json tasks by python tasks, step 1 * wip * simplification part 1 * fix extended tasks + typo * fix * fix nanotron example * small fix * now use function, not string, to pass prompts in examples * moved everyone to function calling * LightevalTask now only takes functions * removed templated type which messed up the test suite * last fix + doc udpate * Update src/lighteval/tasks/registry.py Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> --------- Co-authored-by: Hynek Kydlíček Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- README.md | 12 +- community_tasks/_template.py | 36 +- community_tasks/aimo_evals.py | 21 +- community_tasks/arabic_evals.py | 232 +- 
community_tasks/german_rag_evals.py | 140 +- examples/nanotron/custom_evaluation_tasks.py | 341 +-- examples/nanotron/custom_task.py | 107 +- src/lighteval/logging/evaluation_tracker.py | 2 + src/lighteval/tasks/default_tasks.py | 2471 +++++++++-------- src/lighteval/tasks/extended/ifeval/main.py | 26 +- src/lighteval/tasks/extended/mt_bench/main.py | 42 +- .../tasks/extended/tiny_benchmarks/main.py | 13 +- src/lighteval/tasks/lighteval_task.py | 36 +- 14 files changed, 1730 insertions(+), 1751 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 441ff70a..0551f915 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -34,7 +34,7 @@ repos: - repo: https://github.com/charliermarsh/ruff-pre-commit # Ruff version. - rev: 'v0.1.6' + rev: 'v0.2.2' hooks: - id: ruff args: ['--fix'] diff --git a/README.md b/README.md index 8c6f1063..10364fe4 100644 --- a/README.md +++ b/README.md @@ -139,7 +139,7 @@ accelerate launch --multi_gpu --num_processes= run_evals_accelerate.py --output_dir output_dir ``` -You can find the template of the expected model configuration in [examples/model_configs/base_model.yaml_](./examples/model_configs/base_model.yaml). +You can find the template of the expected model configuration in [examples/model_configs/base_model.yaml_](./examples/model_configs/base_model.yaml). ### Evaluating a large model with pipeline parallelism @@ -197,7 +197,7 @@ There are two types of configuration files that can be provided for running on t 1. [endpoint_model.yaml](./examples/model_configs/endpoint_model.yaml): This configuration allows you to launch the model using [HuggingFace's Inference Endpoints](https://huggingface.co/inference-endpoints/dedicated). You can specify in the configuration file all the relevant parameters, and then `lighteval` will automatically deploy the endpoint, run the evaluation, and finally delete the endpoint (unless you specify an endpoint that was already launched, in which case the endpoint won't be deleted afterwards). -2. [tgi_model.yaml](./examples/model_configs/tgi_model.yaml): This configuration lets you specify the URL of a model running in a TGI container, such as one deployed on HuggingFace's serverless inference. +2. [tgi_model.yaml](./examples/model_configs/tgi_model.yaml): This configuration lets you specify the URL of a model running in a TGI container, such as one deployed on HuggingFace's serverless inference. Templates for these configurations can be found in [examples/model_configs](./examples/model_configs/). @@ -266,7 +266,7 @@ However, we are very grateful to the Harness and HELM teams for their continued - [logging](https://github.com/huggingface/lighteval/tree/main/src/lighteval/logging): Our loggers, to display experiment information and push it to the hub after a run - [metrics](https://github.com/huggingface/lighteval/tree/main/src/lighteval/metrics): All the available metrics you can use. They are described in metrics, and divided between sample metrics (applied at the sample level, such as prediction accuracy) and corpus metrics (applied over the whole corpus). You'll also find available normalisation functions. - [models](https://github.com/huggingface/lighteval/tree/main/src/lighteval/models): Possible models to use. We cover transformers (base_model), with adapter or delta weights, as well as TGI models locally deployed (it's likely the code here is out of date though), and brrr/nanotron models. 
- - [tasks](https://github.com/huggingface/lighteval/tree/main/src/lighteval/tasks): Available tasks. The complete list is in `tasks_table.jsonl`, and you'll find all the prompts in `tasks_prompt_formatting.py`. Popular tasks requiring custom logic are exceptionally added in the [extended tasks](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/extended). + - [tasks](https://github.com/huggingface/lighteval/tree/main/src/lighteval/tasks): Available tasks. The complete list is in `default_tasks.py`, and you'll find all the prompts in `tasks_prompt_formatting.py`. Popular tasks requiring custom logic are exceptionally added in the [extended tasks](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/extended). - [examples/tasks](https://github.com/huggingface/lighteval/tree/main/examples/tasks) contains a list of available tasks you can launch. We advise using tasks in the `recommended_set`, as it's possible that some of the other tasks need double checking. - [tests](https://github.com/huggingface/lighteval/tree/main/tests) contains our test suite, which we run at each PR to prevent regressions in metrics/prompts/tasks, for a subset of important tasks. @@ -285,10 +285,10 @@ A popular community evaluation can move to become an extended or core evaluation #### Core evaluations Prompt function: **find a suitable prompt function** in `src.lighteval.tasks.task_prompt_formatting.py`, or code your own. This function must output a `Doc` object, which should contain the `query`, your prompt, and either `gold`, the gold output, or `choices` and `gold_index`, the list of choices and index or indices of correct answers. If your query contains an instruction that should not be repeated in a few shot setup, add it to an `instruction` field. -Summary: create a **line summary** of your evaluation, in `src/lighteval/tasks/tasks_table.jsonl`. This summary should contain the following fields: +Summary: create a `LightevalTaskConfig` summary of your evaluation, in `src/lighteval/tasks/default_tasks.py`. This summary should contain the following fields: - `name` (str), your evaluation name - `suite` (list), the suite(s) to which your evaluation should belong. This field allows us to compare different task implementations and is used as a task selection to differentiate the versions to launch. At the moment, you'll find the keywords ["helm", "bigbench", "original", "lighteval", "community", "custom"]; for core evals, please choose `lighteval`. -- `prompt_function` (str), the name of the prompt function you defined in the step above +- `prompt_function` (Callable), the prompt function you defined in the step above - `hf_repo` (str), the path to your evaluation dataset on the hub - `hf_subset` (str), the specific subset you want to use for your evaluation (note: when the dataset has no subset, fill this field with `"default"`, not with `None` or `""`) - `hf_avail_splits` (list), all the splits available for your dataset (train, valid or validation, test, other...) @@ -310,7 +310,7 @@ Summary: create a **line summary** of your evaluation, in `src/lighteval/tasks/t Make sure you can launch your model with your new task using `--tasks lighteval|yournewtask|2|0`. #### Community evaluations -Copy the `community_tasks/_template.yml` to `community_tasks/yourevalname.py` and edit it to add your custom tasks (the parameters you can use are explained above). It contains an interesting mechanism if the dataset you are adding contains a lot of subsets. 
+Copy the `community_tasks/_template.py` to `community_tasks/yourevalname.py` and edit it to add your custom tasks (the parameters you can use are explained above). It contains an interesting mechanism if the dataset you are adding contains a lot of subsets. Make sure you can launch your model with your new task using `--tasks community|yournewtask|2|0 --custom_tasks community_tasks/yourevalname.py`. diff --git a/community_tasks/_template.py b/community_tasks/_template.py index fe0d8e1d..5025f741 100644 --- a/community_tasks/_template.py +++ b/community_tasks/_template.py @@ -39,12 +39,28 @@ from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES +# DEFINE YOUR PROMPT FUNCTIONS +# Define as many as you need for your different tasks +def prompt_fn(line, task_name: str = None): + """Defines how to go from a dataset line to a doc object. + Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info + about what this function should do in the README. + """ + return Doc( + task_name=task_name, + query="", + choices="", + gold_index=0, + instruction="", + ) + + # EVAL WITH NO SUBSET ## # This is how you create a simple task (like hellaswag) which has one single subset # attached to it, and one evaluation possible. task = LightevalTaskConfig( name="myothertask", - prompt_function="prompt_fn", # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py + prompt_function=prompt_fn, # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py suite=["community"], hf_repo="", hf_subset="default", @@ -73,7 +89,7 @@ def __init__( super().__init__( name=name, hf_subset=hf_subset, - prompt_function="prompt_fn", # must be defined in the file + prompt_function=prompt_fn, # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py hf_repo="", metric=[""], hf_avail_splits=[], @@ -88,22 +104,6 @@ def __init__( ) -# DEFINE YOUR PROMPT FUNCTIONS -# Define as many as you need for your different tasks -def prompt_fn(line, task_name: str = None): - """Defines how to go from a dataset line to a doc object. - Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info - about what this function should do in the README. 
- """ - return Doc( - task_name=task_name, - query="", - choices="", - gold_index=0, - instruction="", - ) - - # STORE YOUR EVALS SUBSET_TASKS = [CustomSubsetTask(name=f"mytask:{subset}", hf_subset=subset) for subset in SAMPLE_SUBSETS] TASKS_TABLE = SUBSET_TASKS + [task] diff --git a/community_tasks/aimo_evals.py b/community_tasks/aimo_evals.py index 5262a013..950becd5 100644 --- a/community_tasks/aimo_evals.py +++ b/community_tasks/aimo_evals.py @@ -29,9 +29,18 @@ from lighteval.tasks.requests import Doc +def aimo_prompt(line, task_name: str = None): + return Doc( + task_name=task_name, + choices=[str(line["answer"])], + gold_index=0, + query=line["problem"], + ) + + task = LightevalTaskConfig( name="aimo_progress_prize_1", - prompt_function="aimo_prompt", + prompt_function=aimo_prompt, suite=["community"], hf_subset="", hf_repo="lighteval/aimo_progress_prize_1", @@ -44,16 +53,6 @@ stop_sequence=None, ) - -def aimo_prompt(line, task_name: str = None): - return Doc( - task_name=task_name, - choices=[str(line["answer"])], - gold_index=0, - query=line["problem"], - ) - - # STORE YOUR EVALS TASKS_TABLE = [task] diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index 495c95d9..b0aa15aa 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -53,6 +53,28 @@ # fmt: on +def mmlu_arabic(line, task_name: str = None): + topic = line["subject"] + instruction = f"الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح حول {topic.replace('_', ' ')}. \n\n" + choices = [line["A"], line["B"], line["C"], line["D"]] + # Answers are provided with roman letters - we look for the correct index in LETTER_INDICES, + # it will then be applied to arabic letters + gold_ix = LETTER_INDICES.index(line["answer"]) + + query = f"{instruction}{line['question']}\n" + query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES_AR[:4], choices)]) + query += "الإجابة:" + + return Doc( + task_name=task_name, + query=query, + choices=LETTER_INDICES_AR[:4], + gold_index=gold_ix, + instruction=instruction, + target_for_fewshot_sorting=LETTER_INDICES_AR[gold_ix], + ) + + class CustomArabicMMLUTask(LightevalTaskConfig): def __init__( self, @@ -62,7 +84,7 @@ def __init__( super().__init__( name=name, hf_subset=hf_subset, - prompt_function="mmlu_arabic", + prompt_function=mmlu_arabic, hf_repo="OALL/Arabic_MMLU", metric=["loglikelihood_acc_norm"], hf_avail_splits=["test", "dev"], @@ -83,29 +105,6 @@ def __init__( CustomArabicMMLUTask(name=f"arabic_mmlu:{subset}", hf_subset=subset) for subset in ARABIC_MMLU_SUBSETS ] - -def mmlu_arabic(line, task_name: str = None): - topic = line["subject"] - instruction = f"الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح حول {topic.replace('_', ' ')}. \n\n" - choices = [line["A"], line["B"], line["C"], line["D"]] - # Answers are provided with roman letters - we look for the correct index in LETTER_INDICES, - # it will then be applied to arabic letters - gold_ix = LETTER_INDICES.index(line["answer"]) - - query = f"{instruction}{line['question']}\n" - query += "".join([f"{key}. 
{choice}\n" for key, choice in zip(LETTER_INDICES_AR[:4], choices)]) - query += "الإجابة:" - - return Doc( - task_name=task_name, - query=query, - choices=LETTER_INDICES_AR[:4], - gold_index=gold_ix, - instruction=instruction, - target_for_fewshot_sorting=LETTER_INDICES_AR[gold_ix], - ) - - # ACVA ## # fmt: off ACVA_SUBSETS = [ @@ -121,6 +120,18 @@ def mmlu_arabic(line, task_name: str = None): # fmt: on +def acva(line, task_name: str = None): + question = line["question"] + answer = line["answer"] + + return Doc( + task_name=task_name, + query=f"السؤال: {question}\nالإجابة:", + choices=["صح", "خطأ"], + gold_index=["صح", "خطأ"].index(answer), + ) + + class CustomACVATask(LightevalTaskConfig): def __init__( self, @@ -130,7 +141,7 @@ def __init__( super().__init__( name=name, hf_subset=hf_subset, - prompt_function="acva", + prompt_function=acva, hf_repo="OALL/ACVA", metric=["loglikelihood_acc_norm"], hf_avail_splits=["test", "validation"], @@ -150,22 +161,33 @@ def __init__( ACVA_TASKS = [CustomACVATask(name=f"acva:{subset}", hf_subset=subset) for subset in ACVA_SUBSETS] -def acva(line, task_name: str = None): +def arabic_exams(line, task_name: str = None): + topic = line["subject"] question = line["question"] + choices = [line["A"], line["B"], line["C"], line["D"]] + choices_formatted = [f" {LETTER_INDICES_AR[i]}) {choice}\n" for i, choice in enumerate(choices)] answer = line["answer"] + answer_index = LETTER_INDICES.index(answer) + + instruction = f"الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح حول {topic.replace('_', ' ')}. \n\n" + query = f"{instruction}السؤال: {question}\n" + query += "\n".join(choices_formatted) + query += "\nالإجابة:" return Doc( task_name=task_name, - query=f"السؤال: {question}\nالإجابة:", - choices=["صح", "خطأ"], - gold_index=["صح", "خطأ"].index(answer), + query=query, + choices=LETTER_INDICES_AR[:4], + gold_index=answer_index, + instruction=instruction, + target_for_fewshot_sorting=choices[answer_index], ) # ARABIC EXAMS ## arabic_exams_task = LightevalTaskConfig( name="arabic_exams", - prompt_function="arabic_exams", + prompt_function=arabic_exams, suite=["community"], hf_repo="OALL/Arabic_EXAMS", hf_subset="default", @@ -179,39 +201,39 @@ def acva(line, task_name: str = None): ) -def arabic_exams(line, task_name: str = None): - topic = line["subject"] - question = line["question"] - choices = [line["A"], line["B"], line["C"], line["D"]] - choices_formatted = [f" {LETTER_INDICES_AR[i]}) {choice}\n" for i, choice in enumerate(choices)] - answer = line["answer"] - answer_index = LETTER_INDICES.index(answer) +# ALGHAFA NATIVE ## +# fmt: off +ALGHAFA_SUBSETS = [ + "mcq_exams_test_ar", "meta_ar_dialects", "meta_ar_msa", "multiple_choice_facts_truefalse_balanced_task", "multiple_choice_grounded_statement_soqal_task", + "multiple_choice_grounded_statement_xglue_mlqa_task", "multiple_choice_rating_sentiment_no_neutral_task", "multiple_choice_rating_sentiment_task", + "multiple_choice_sentiment_task" +] +# fmt: on - instruction = f"الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح حول {topic.replace('_', ' ')}. 
\n\n" + +def alghafa_prompt(line, task_name: str = None): + question = line["query"] + answer_index = int(line["label"]) + # Dynamically determining the choices by excluding '__few_shots', 'query' and 'label' + choices_keys = [key for key in line.keys() if key not in ["query", "label", "__few_shots"]] + choices = [line[key] for key in choices_keys] + + instruction = "الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح\n\n" query = f"{instruction}السؤال: {question}\n" - query += "\n".join(choices_formatted) - query += "\nالإجابة:" + for index, choice in enumerate(choices): + query += f"{index}) {choice}\n" + query += "الإجابة:" return Doc( task_name=task_name, query=query, - choices=LETTER_INDICES_AR[:4], + choices=choices, gold_index=answer_index, instruction=instruction, target_for_fewshot_sorting=choices[answer_index], ) -# ALGHAFA NATIVE ## -# fmt: off -ALGHAFA_SUBSETS = [ - "mcq_exams_test_ar", "meta_ar_dialects", "meta_ar_msa", "multiple_choice_facts_truefalse_balanced_task", "multiple_choice_grounded_statement_soqal_task", - "multiple_choice_grounded_statement_xglue_mlqa_task", "multiple_choice_rating_sentiment_no_neutral_task", "multiple_choice_rating_sentiment_task", - "multiple_choice_sentiment_task" -] -# fmt: on - - class CustomAlGhafaNativeTask(LightevalTaskConfig): def __init__( self, @@ -221,7 +243,7 @@ def __init__( super().__init__( name=name, hf_subset=hf_subset, - prompt_function="alghafa_prompt", + prompt_function=alghafa_prompt, hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native", metric=["loglikelihood_acc_norm"], hf_avail_splits=["test", "validation"], @@ -239,35 +261,11 @@ def __init__( ALGHAFA_TASKS = [CustomAlGhafaNativeTask(name=f"alghafa:{subset}", hf_subset=subset) for subset in ALGHAFA_SUBSETS] - -def alghafa_prompt(line, task_name: str = None): - question = line["query"] - answer_index = int(line["label"]) - # Dynamically determining the choices by excluding '__few_shots', 'query' and 'label' - choices_keys = [key for key in line.keys() if key not in ["query", "label", "__few_shots"]] - choices = [line[key] for key in choices_keys] - - instruction = "الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح\n\n" - query = f"{instruction}السؤال: {question}\n" - for index, choice in enumerate(choices): - query += f"{index}) {choice}\n" - query += "الإجابة:" - - return Doc( - task_name=task_name, - query=query, - choices=choices, - gold_index=answer_index, - instruction=instruction, - target_for_fewshot_sorting=choices[answer_index], - ) - - # ALGHAFA TRANSLATED ## # race_ar race_ar_task = LightevalTaskConfig( name="race_ar", - prompt_function="alghafa_prompt", + prompt_function=alghafa_prompt, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="race_ar", @@ -284,7 +282,7 @@ def alghafa_prompt(line, task_name: str = None): # piqa_ar piqa_ar_task = LightevalTaskConfig( name="piqa_ar", - prompt_function="alghafa_prompt", + prompt_function=alghafa_prompt, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="piqa_ar", @@ -301,7 +299,7 @@ def alghafa_prompt(line, task_name: str = None): # arc_easy_ar arc_easy_ar_task = LightevalTaskConfig( name="arc_easy_ar", - prompt_function="alghafa_prompt", + prompt_function=alghafa_prompt, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="arc_easy_ar", @@ -318,7 +316,7 @@ def alghafa_prompt(line, task_name: str = None): # arc_challenge_okapi_ar arc_challenge_okapi_ar_task = LightevalTaskConfig( 
name="arc_challenge_okapi_ar", - prompt_function="alghafa_prompt", + prompt_function=alghafa_prompt, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="arc_challenge_okapi_ar", @@ -335,7 +333,7 @@ def alghafa_prompt(line, task_name: str = None): # mmlu_okapi_ar mmlu_okapi_ar_task = LightevalTaskConfig( name="mmlu_okapi_ar", - prompt_function="alghafa_prompt", + prompt_function=alghafa_prompt, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="mmlu_okapi_ar", @@ -352,7 +350,7 @@ def alghafa_prompt(line, task_name: str = None): # openbook_qa_ext_ar openbook_qa_ext_ar_task = LightevalTaskConfig( name="openbook_qa_ext_ar", - prompt_function="alghafa_prompt", + prompt_function=alghafa_prompt, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="openbook_qa_ext_ar", @@ -367,20 +365,6 @@ def alghafa_prompt(line, task_name: str = None): # boolq_ar -boolq_ar_task = LightevalTaskConfig( - name="boolq_ar", - prompt_function="boolq_prompt_arabic", - suite=["community"], - hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", - hf_subset="boolq_ar", - hf_avail_splits=["test", "validation"], - evaluation_splits=["test"], - few_shots_split="validation", - few_shots_select="sequential", - metric=["loglikelihood_acc_norm"], - trust_dataset=True, - version=0, -) def boolq_prompt_arabic(line, task_name: str = None): @@ -406,13 +390,12 @@ def boolq_prompt_arabic(line, task_name: str = None): ) -# copa_ext_ar -copa_ext_ar_task = LightevalTaskConfig( - name="copa_ext_ar", - prompt_function="copa_prompt_arabic", +boolq_ar_task = LightevalTaskConfig( + name="boolq_ar", + prompt_function=boolq_prompt_arabic, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", - hf_subset="copa_ext_ar", + hf_subset="boolq_ar", hf_avail_splits=["test", "validation"], evaluation_splits=["test"], few_shots_split="validation", @@ -423,6 +406,7 @@ def boolq_prompt_arabic(line, task_name: str = None): ) +# copa_ext_ar def copa_prompt_arabic(line, task_name: str = None): premise = line["premise"] choices = [line["choice1"], line["choice2"]] @@ -442,13 +426,12 @@ def copa_prompt_arabic(line, task_name: str = None): ) -# hellaswag_okapi_ar -hellaswag_okapi_ar_task = LightevalTaskConfig( - name="hellaswag_okapi_ar", - prompt_function="hellaswag_prompt_arabic", +copa_ext_ar_task = LightevalTaskConfig( + name="copa_ext_ar", + prompt_function=copa_prompt_arabic, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", - hf_subset="hellaswag_okapi_ar", + hf_subset="copa_ext_ar", hf_avail_splits=["test", "validation"], evaluation_splits=["test"], few_shots_split="validation", @@ -459,6 +442,7 @@ def copa_prompt_arabic(line, task_name: str = None): ) +# hellaswag_okapi_ar def hellaswag_prompt_arabic(line, task_name: str = None): ctx = re.sub(r"\[.*?\]", "", line["ctx"]) # Remove latin words within brackets endings = [ @@ -487,13 +471,12 @@ def hellaswag_prompt_arabic(line, task_name: str = None): ) -# toxigen_ar -toxigen_ar_task = LightevalTaskConfig( - name="toxigen_ar", - prompt_function="toxigen_prompt_arabic", +hellaswag_okapi_ar_task = LightevalTaskConfig( + name="hellaswag_okapi_ar", + prompt_function=hellaswag_prompt_arabic, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", - hf_subset="toxigen_ar", + hf_subset="hellaswag_okapi_ar", hf_avail_splits=["test", "validation"], evaluation_splits=["test"], few_shots_split="validation", @@ -504,6 +487,7 @@ def 
hellaswag_prompt_arabic(line, task_name: str = None): ) +# toxigen_ar def toxigen_prompt_arabic(line, task_name: str = None): text = line["text"] label = 1 if ((line["toxicity_ai"] + line["toxicity_human"]) > 5.5) else 0 @@ -525,13 +509,12 @@ def toxigen_prompt_arabic(line, task_name: str = None): ) -# sciq_ar -sciq_ar_task = LightevalTaskConfig( - name="sciq_ar", - prompt_function="sciq_prompt_arabic", +toxigen_ar_task = LightevalTaskConfig( + name="toxigen_ar", + prompt_function=toxigen_prompt_arabic, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", - hf_subset="sciq_ar", + hf_subset="toxigen_ar", hf_avail_splits=["test", "validation"], evaluation_splits=["test"], few_shots_split="validation", @@ -542,6 +525,7 @@ def toxigen_prompt_arabic(line, task_name: str = None): ) +# sciq_ar def sciq_prompt_arabic(line, task_name: str = None): support = line["support"] question = line["question"] @@ -577,6 +561,22 @@ def sciq_prompt_arabic(line, task_name: str = None): ) +sciq_ar_task = LightevalTaskConfig( + name="sciq_ar", + prompt_function=sciq_prompt_arabic, + suite=["community"], + hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", + hf_subset="sciq_ar", + hf_avail_splits=["test", "validation"], + evaluation_splits=["test"], + few_shots_split="validation", + few_shots_select="sequential", + metric=["loglikelihood_acc_norm"], + trust_dataset=True, + version=0, +) + + TASKS_TABLE = ( ARABIC_MMLU_TASKS + ACVA_TASKS diff --git a/community_tasks/german_rag_evals.py b/community_tasks/german_rag_evals.py index 0d2c76c0..82cdf2da 100644 --- a/community_tasks/german_rag_evals.py +++ b/community_tasks/german_rag_evals.py @@ -34,76 +34,6 @@ from lighteval.tasks.requests import Doc -# Task 1: Choose question by context. -# Given is a context and 4 questions. -# The task is to decide which question can be answered by the context. -task1 = LightevalTaskConfig( - name="german_rag_eval:choose_question_by_context", - prompt_function="prompt_fn_choose_question_by_context", - suite=["community"], - hf_repo="deutsche-telekom/Ger-RAG-eval", - hf_subset="task1", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split="test", - few_shots_select="sequential", - metric=["loglikelihood_acc"], - version=1, -) - -# Task 2: Choose context by question. -# Given is a question and 4 contexts. -# The task is to decide which context can answer the question. -task2 = LightevalTaskConfig( - name="german_rag_eval:choose_context_by_question", - prompt_function="prompt_fn_choose_context_by_question", - suite=["community"], - hf_repo="deutsche-telekom/Ger-RAG-eval", - hf_subset="task2", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split="test", - few_shots_select="sequential", - metric=["loglikelihood_acc"], - version=1, -) - - -# Task 3: Question-answer match. -# Given is a question and an answer. -# The task is to decide whether the answer actualy answers the question. -task3 = LightevalTaskConfig( - name="german_rag_eval:question_answer_match", - prompt_function="prompt_fn_question_answer_match", - suite=["community"], - hf_repo="deutsche-telekom/Ger-RAG-eval", - hf_subset="task3", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split="test", - few_shots_select="sequential", - metric=["loglikelihood_acc"], - version=1, -) - -# Task 4: Context-question match. -# Given is a context and a question. -# The task is to decide whether the question can be answered by the context or not. 
-task4 = LightevalTaskConfig( - name="german_rag_eval:context_question_match", - prompt_function="prompt_fn_context_question_match", - suite=["community"], - hf_repo="deutsche-telekom/Ger-RAG-eval", - hf_subset="task4", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split="test", - few_shots_select="sequential", - metric=["loglikelihood_acc"], - version=1, -) - - def prompt_fn_choose_question_by_context(line, task_name: str = None): instruction = "Welche der folgenden Fragen (A oder B oder C oder D) lässt sich anhand des Kontext beantworten?\n\n" query_template = """\ @@ -218,6 +148,76 @@ def prompt_fn_context_question_match(line, task_name: str = None): ) +# Task 1: Choose question by context. +# Given is a context and 4 questions. +# The task is to decide which question can be answered by the context. +task1 = LightevalTaskConfig( + name="german_rag_eval:choose_question_by_context", + prompt_function=prompt_fn_choose_question_by_context, + suite=["community"], + hf_repo="deutsche-telekom/Ger-RAG-eval", + hf_subset="task1", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split="test", + few_shots_select="sequential", + metric=["loglikelihood_acc"], + version=1, +) + +# Task 2: Choose context by question. +# Given is a question and 4 contexts. +# The task is to decide which context can answer the question. +task2 = LightevalTaskConfig( + name="german_rag_eval:choose_context_by_question", + prompt_function=prompt_fn_choose_context_by_question, + suite=["community"], + hf_repo="deutsche-telekom/Ger-RAG-eval", + hf_subset="task2", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split="test", + few_shots_select="sequential", + metric=["loglikelihood_acc"], + version=1, +) + + +# Task 3: Question-answer match. +# Given is a question and an answer. +# The task is to decide whether the answer actualy answers the question. +task3 = LightevalTaskConfig( + name="german_rag_eval:question_answer_match", + prompt_function=prompt_fn_question_answer_match, + suite=["community"], + hf_repo="deutsche-telekom/Ger-RAG-eval", + hf_subset="task3", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split="test", + few_shots_select="sequential", + metric=["loglikelihood_acc"], + version=1, +) + +# Task 4: Context-question match. +# Given is a context and a question. +# The task is to decide whether the question can be answered by the context or not. 
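The configs re-registered in this block now hold callables, and all four feed TASKS_TABLE at the end of the file. Elsewhere in this patch series, run strings of the form suite|task|num_fewshots|truncation are built from such configs (see the custom|{t.name}|0|1 strings in examples/nanotron/custom_evaluation_tasks.py); the sketch below derives the equivalent community|... strings, with the config class reduced to a hypothetical stand-in:

from dataclasses import dataclass, field
from typing import List


@dataclass
class MiniTaskConfig:
    # Hypothetical stand-in carrying only the fields this sketch needs.
    name: str
    suite: List[str] = field(default_factory=lambda: ["community"])


def task_string(cfg: MiniTaskConfig, num_fewshot: int = 0, truncate_fewshot: int = 0) -> str:
    # "<suite>|<task name>|<num few-shots>|<truncation flag>"
    return f"{cfg.suite[0]}|{cfg.name}|{num_fewshot}|{truncate_fewshot}"


german_rag_tasks = [
    MiniTaskConfig(name=f"german_rag_eval:{sub}")
    for sub in (
        "choose_question_by_context",
        "choose_context_by_question",
        "question_answer_match",
        "context_question_match",
    )
]

print(",".join(task_string(t) for t in german_rag_tasks))
# community|german_rag_eval:choose_question_by_context|0|0,community|german_rag_eval:choose_context_by_question|0|0,...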
+task4 = LightevalTaskConfig( + name="german_rag_eval:context_question_match", + prompt_function=prompt_fn_context_question_match, + suite=["community"], + hf_repo="deutsche-telekom/Ger-RAG-eval", + hf_subset="task4", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split="test", + few_shots_select="sequential", + metric=["loglikelihood_acc"], + version=1, +) + + # STORE YOUR EVALS TASKS_TABLE = [task1, task2, task3, task4] diff --git a/examples/nanotron/custom_evaluation_tasks.py b/examples/nanotron/custom_evaluation_tasks.py index 62aa8dc4..3128335b 100644 --- a/examples/nanotron/custom_evaluation_tasks.py +++ b/examples/nanotron/custom_evaluation_tasks.py @@ -30,6 +30,7 @@ from dataclasses import asdict from typing import Dict, List, Tuple +import lighteval.tasks.tasks_prompt_formatting as prompt from lighteval.metrics import Metrics from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc @@ -39,11 +40,52 @@ _TASKS_STRINGS: List[Tuple[LightevalTaskConfig, str]] = [] _TASKS: List[LightevalTaskConfig] = [] + # COMMON_SENSE_REASONING_TASKS ## +def commonsense_qa_prompt(line, task_name: str = None): + return Doc( + task_name=task_name, + query=line["question"], + choices=[f" {c}" for c in line["choices"]["text"]], + gold_index=LETTER_INDICES.index(line["answerKey"].strip()), + instruction="", + ) + + +def siqa_prompt(line, task_name: str = None): + return Doc( + task_name=task_name, + query=line["context"] + " " + line["question"], + choices=[f" {c}" for c in [line["answerA"], line["answerB"], line["answerC"]]], + gold_index=int(line["label"]) - 1, + instruction="", + ) + + +def hellaswag_prompt(line, task_name: str = None): + def preprocess(text): + """Comes from AiHarness""" + # text = text.strip() + # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag. + text = text.replace(" [title]", ". 
") + text = re.sub("\\[.*?\\]", "", text) + text = text.replace(" ", " ") + return text + + ctx = f"{line['ctx_a']} {line['ctx_b'].capitalize()} " + return Doc( + task_name=task_name, + query=preprocess(line["activity_label"] + ": " + ctx), + choices=[" " + preprocess(ending) for ending in line["endings"]], + gold_index=int(line["label"]) if line["label"] != "" else -1, # -1 for test + # "metric": "choices_loglikelihood", + ) + + COMMON_SENSE_REASONING_TASKS = [ LightevalTaskConfig( name="hellaswag", - prompt_function="hellaswag_prompt", + prompt_function=hellaswag_prompt, hf_repo="hellaswag", hf_subset="default", metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], @@ -52,7 +94,7 @@ ), LightevalTaskConfig( name="winogrande", - prompt_function="winogrande", + prompt_function=prompt.winogrande, hf_repo="winogrande", hf_subset="winogrande_xl", metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], @@ -61,7 +103,7 @@ ), LightevalTaskConfig( name="piqa", - prompt_function="piqa_harness", + prompt_function=prompt.piqa_harness, hf_repo="piqa", hf_subset="plain_text", metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], @@ -70,7 +112,7 @@ ), LightevalTaskConfig( name="siqa", - prompt_function="siqa_prompt", + prompt_function=siqa_prompt, hf_repo="lighteval/siqa", hf_subset="default", hf_avail_splits=["train", "validation"], @@ -80,7 +122,7 @@ ), LightevalTaskConfig( name="openbookqa", - prompt_function="openbookqa", + prompt_function=prompt.openbookqa, hf_repo="openbookqa", hf_subset="main", metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], @@ -89,7 +131,7 @@ ), LightevalTaskConfig( name="arc:easy", - prompt_function="arc", + prompt_function=prompt.arc, hf_repo="ai2_arc", hf_subset="ARC-Easy", evaluation_splits=["test"], @@ -100,7 +142,7 @@ ), LightevalTaskConfig( name="arc:challenge", - prompt_function="arc", + prompt_function=prompt.arc, hf_repo="ai2_arc", hf_subset="ARC-Challenge", evaluation_splits=["test"], @@ -111,7 +153,7 @@ ), LightevalTaskConfig( name="commonsense_qa", - prompt_function="commonsense_qa_prompt", + prompt_function=commonsense_qa_prompt, hf_repo="commonsense_qa", hf_subset="default", metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], @@ -121,57 +163,27 @@ ] -def commonsense_qa_prompt(line, task_name: str = None): - return Doc( - task_name=task_name, - query=line["question"], - choices=[f" {c}" for c in line["choices"]["text"]], - gold_index=LETTER_INDICES.index(line["answerKey"].strip()), - instruction="", - ) +# 0 short for common sense +COMMON_SENSE_REASONING_STRING = [(t, f"custom|{t.name}|0|1") for t in COMMON_SENSE_REASONING_TASKS] +_TASKS_STRINGS.extend(COMMON_SENSE_REASONING_STRING) +_TASKS += COMMON_SENSE_REASONING_TASKS -def siqa_prompt(line, task_name: str = None): +# WORLD_KNOWLEDGE_TASKS ## +def natural_questions_prompt(line, task_name: str = None): return Doc( task_name=task_name, - query=line["context"] + " " + line["question"], - choices=[f" {c}" for c in [line["answerA"], line["answerB"], line["answerC"]]], - gold_index=int(line["label"]) - 1, + query=line["question"] + "?\nAnswer: ", + choices=[line["short_answers"]], + gold_index=0, instruction="", ) -def hellaswag_prompt(line, task_name: str = None): - def preprocess(text): - """Comes from AiHarness""" - # text = text.strip() - # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag. - text = text.replace(" [title]", ". 
") - text = re.sub("\\[.*?\\]", "", text) - text = text.replace(" ", " ") - return text - - ctx = f"{line['ctx_a']} {line['ctx_b'].capitalize()} " - return Doc( - task_name=task_name, - query=preprocess(line["activity_label"] + ": " + ctx), - choices=[" " + preprocess(ending) for ending in line["endings"]], - gold_index=int(line["label"]) if line["label"] != "" else -1, # -1 for test - # "metric": "choices_loglikelihood", - ) - - -# 0 short for common sense -COMMON_SENSE_REASONING_STRING = [(t, f"custom|{t.name}|0|1") for t in COMMON_SENSE_REASONING_TASKS] -_TASKS_STRINGS.extend(COMMON_SENSE_REASONING_STRING) -_TASKS += COMMON_SENSE_REASONING_TASKS - -# WORLD_KNOWLEDGE_TASKS ## - WORLD_KNOWLEDGE_TASKS = [ LightevalTaskConfig( name="trivia_qa", - prompt_function="triviaqa", + prompt_function=prompt.triviaqa, hf_repo="trivia_qa", hf_subset="rc.nocontext", metric=[Metrics.quasi_exact_match], @@ -181,7 +193,7 @@ def preprocess(text): ), LightevalTaskConfig( name="natural_questions", - prompt_function="natural_questions_prompt", + prompt_function=natural_questions_prompt, hf_repo="lighteval/natural_questions_clean", hf_subset="default", metric=[Metrics.quasi_exact_match], @@ -192,27 +204,26 @@ def preprocess(text): ] -def natural_questions_prompt(line, task_name: str = None): - return Doc( - task_name=task_name, - query=line["question"] + "?\nAnswer: ", - choices=[line["short_answers"]], - gold_index=0, - instruction="", - ) - - WORLD_KNOWLEDGE_STRING = [(t, f"custom|{t.name}|5|1") for t in WORLD_KNOWLEDGE_TASKS] # WORLD_KNOWLEDGE_STRING = {t: f'custom|{t.name}|0|1' for t in WORLD_KNOWLEDGE_TASKS} _TASKS_STRINGS.extend(WORLD_KNOWLEDGE_STRING) _TASKS += WORLD_KNOWLEDGE_TASKS + # Reading comprehension ## +def boolq_prompt(line, task_name: str = None): + return Doc( + task_name=task_name, + query=f"{line['passage']}\nQuestion: {line['question'].capitalize()}?\nAnswer:", + choices=[" No", " Yes"], # Only gold + gold_index=int(line["label"]), + ) + READING_COMP_TASKS = [ LightevalTaskConfig( name="super_glue:boolq", - prompt_function="boolq_prompt", + prompt_function=boolq_prompt, hf_repo="super_glue", hf_subset="boolq", metric=["target_perplexity"], @@ -221,7 +232,7 @@ def natural_questions_prompt(line, task_name: str = None): ), LightevalTaskConfig( name="quac", - prompt_function="quac", + prompt_function=prompt.quac, hf_repo="lighteval/quac_helm", hf_subset="deault", metric=[Metrics.quasi_exact_match], @@ -232,15 +243,6 @@ def natural_questions_prompt(line, task_name: str = None): ] -def boolq_prompt(line, task_name: str = None): - return Doc( - task_name=task_name, - query=f"{line['passage']}\nQuestion: {line['question'].capitalize()}?\nAnswer:", - choices=[" No", " Yes"], # Only gold - gold_index=int(line["label"]), - ) - - READING_COMP_STRING = [(t, f"custom|{t.name}|0|1") for t in READING_COMP_TASKS] _TASKS_STRINGS.extend(READING_COMP_STRING) _TASKS += READING_COMP_TASKS @@ -253,7 +255,7 @@ class CustomMathEvaluationTask(LightevalTaskConfig): def __init__( self, name, - prompt_function="math", + prompt_function=prompt.math, hf_repo="lighteval/MATH", hf_subset=None, metric=[Metrics.quasi_exact_match_math], @@ -298,7 +300,7 @@ def __init__( ] GSM8K = LightevalTaskConfig( name="gsm8k", - prompt_function="gsm8k", + prompt_function=prompt.gsm8k, hf_repo="gsm8k", hf_subset="main", hf_avail_splits=["train", "test"], @@ -317,11 +319,46 @@ def __init__( # MMLU ## +def mmlu_harness(line, task_name: str = None): + topic = line["subject"] + prompt = f"The following are multiple choice questions (with 
answers) about {topic.replace('_', ' ')}.\n\n" + prompt += line["question"] + "\n" + prompt += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, line["choices"])]) + prompt += "Answer:" + + gold_ix = LETTER_INDICES.index(line["answer"]) if isinstance(line["answer"], str) else line["answer"] + "__few_shots" in line and line["__few_shots"] is True # We are adding few shots + + return Doc( + task_name=task_name, + query=prompt, + choices=[" A", " B", " C", " D"], + target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix], + gold_index=gold_ix, + instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n", + ) + + +def mmlu_prompt(line, task_name: str = None): + """MMLU prompt without letters""" + topic = line["subject"] + prompt = f"The following are questions about {topic.replace('_', ' ')}.\nQuestion: " + prompt += line["question"] + "\nAnswer:" + + return Doc( + task_name=task_name, + query=prompt, + choices=[f" {c}" for c in line["choices"]], + gold_index=line["answer"], + instruction=f"The following are questions about {topic.replace('_', ' ')}.\n", + ) + + class CustomMMLUEvaluationTask(LightevalTaskConfig): def __init__( self, name, - prompt_function="mmlu_prompt", + prompt_function=mmlu_prompt, hf_repo="lighteval/mmlu", hf_subset=None, # metric=[Metrics.loglikelihood_acc_single_token], @@ -419,54 +456,27 @@ def __init__( ] -def mmlu_harness(line, task_name: str = None): - topic = line["subject"] - prompt = f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n" - prompt += line["question"] + "\n" - prompt += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, line["choices"])]) - prompt += "Answer:" - - gold_ix = LETTER_INDICES.index(line["answer"]) if isinstance(line["answer"], str) else line["answer"] - "__few_shots" in line and line["__few_shots"] is True # We are adding few shots - - return Doc( - task_name=task_name, - query=prompt, - choices=[" A", " B", " C", " D"], - target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix], - gold_index=gold_ix, - instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n", - ) - - -def mmlu_prompt(line, task_name: str = None): - """MMLU prompt without letters""" - topic = line["subject"] - prompt = f"The following are questions about {topic.replace('_', ' ')}.\nQuestion: " - prompt += line["question"] + "\nAnswer:" - - return Doc( - task_name=task_name, - query=prompt, - choices=[f" {c}" for c in line["choices"]], - gold_index=line["answer"], - instruction=f"The following are questions about {topic.replace('_', ' ')}.\n", - ) - - # MMLU_STRING = {t: f'custom|{t.name}|5|1' for t in MMLU_TASKS} MMLU_STRING = [(t, f"custom|{t.name}|0|1") for t in MMLU_TASKS] _TASKS_STRINGS.extend(MMLU_STRING) _TASKS += MMLU_TASKS + # BBH ## +def bbh_prompt(line, task_name: str = None): + return Doc( + task_name=task_name, + query=line["input"] + "\nAnswer: ", + choices=[line["target"]], + gold_index=0, + ) class CustomBBHEvaluationTask(LightevalTaskConfig): def __init__( self, name, - prompt_function="bbh_prompt", + prompt_function=bbh_prompt, hf_repo="lighteval/big_bench_hard", hf_subset=None, metric=[Metrics.exact_match], @@ -539,27 +549,69 @@ def __init__( ] -def bbh_prompt(line, task_name: str = None): +# BBH_STRING = {t: f'custom|{t.name}|3|1' for t in BBH_TASKS} +BBH_STRING = [(t, f"custom|{t.name}|0|1") for t in BBH_TASKS] +_TASKS_STRINGS.extend(BBH_STRING) +_TASKS += 
BBH_TASKS + + +# AGI eval ## + + +def agi_eval_math_prompt(line, task_name: str = None): return Doc( task_name=task_name, - query=line["input"] + "\nAnswer: ", - choices=[line["target"]], + query=line["question"], + choices=[line["answer"]], gold_index=0, + instruction="", ) -# BBH_STRING = {t: f'custom|{t.name}|3|1' for t in BBH_TASKS} -BBH_STRING = [(t, f"custom|{t.name}|0|1") for t in BBH_TASKS] -_TASKS_STRINGS.extend(BBH_STRING) -_TASKS += BBH_TASKS +def agi_eval_prompt(line, task_name: str = None): + cleaned_options = [o.replace("(", "").replace(")", " ") for o in line["options"]] + prompt = "The following are multiple choice questions (with answers).\n\n" + prompt += line["question"] + "\n" + "\n".join(cleaned_options) + "\n" + prompt += "Answer: " + + choices = LETTER_INDICES[: len(line["options"])] + + output = Doc( + query=prompt, + instruction="The following are multiple choice questions (with answers).\n\n", + ) + + if line["label"]: + output.choices = choices + output.gold_index = LETTER_INDICES.index(line["label"].strip()) + else: + output.choices = [line["answer"]] + output.gold_index = 0 + + return output + + +def agi_eval_prompt_no_letters(line, task_name: str = None): + cleaned_options = [ + " " + o.replace("(A)", "").replace("(B)", "").replace("(C)", "").replace("(D)", "").replace("(E)", "") + for o in line["options"] + ] + + output = Doc( + query=line["question"], + choices=cleaned_options, + gold_index=LETTER_INDICES.index(line["label"].strip()), + instruction="", + ) + + return output -# AGI eval ## class CustomAGIEvalEvaluationTask(LightevalTaskConfig): def __init__( self, name, - prompt_function="agi_eval_prompt_no_letters", + prompt_function=agi_eval_prompt_no_letters, hf_repo="lighteval/agi_eval_en", hf_subset=None, # metric=[Metrics.loglikelihood_acc_single_token], @@ -603,7 +655,7 @@ def __init__( CustomAGIEvalEvaluationTask( name="agi_eval:math", hf_subset="math", - prompt_function="agi_eval_math_prompt", + prompt_function=agi_eval_math_prompt, metric=[Metrics.exact_match, Metrics.quasi_exact_match], generation_size=40, ), @@ -612,55 +664,6 @@ def __init__( ] -def agi_eval_math_prompt(line, task_name: str = None): - return Doc( - task_name=task_name, - query=line["question"], - choices=[line["answer"]], - gold_index=0, - instruction="", - ) - - -def agi_eval_prompt(line, task_name: str = None): - cleaned_options = [o.replace("(", "").replace(")", " ") for o in line["options"]] - prompt = "The following are multiple choice questions (with answers).\n\n" - prompt += line["question"] + "\n" + "\n".join(cleaned_options) + "\n" - prompt += "Answer: " - - choices = LETTER_INDICES[: len(line["options"])] - - output = Doc( - query=prompt, - instruction="The following are multiple choice questions (with answers).\n\n", - ) - - if line["label"]: - output.choices = choices - output.gold_index = LETTER_INDICES.index(line["label"].strip()) - else: - output.choices = [line["answer"]] - output.gold_index = 0 - - return output - - -def agi_eval_prompt_no_letters(line, task_name: str = None): - cleaned_options = [ - " " + o.replace("(A)", "").replace("(B)", "").replace("(C)", "").replace("(D)", "").replace("(E)", "") - for o in line["options"] - ] - - output = Doc( - query=line["question"], - choices=cleaned_options, - gold_index=LETTER_INDICES.index(line["label"].strip()), - instruction="", - ) - - return output - - # AGIEVAL_STRING = {t: f'custom|{t.name}|5|1' for t in AGIEVAL_TASKS} AGIEVAL_STRING = [(t, f"custom|{t.name}|0|1") for t in AGIEVAL_TASKS] 
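CustomAGIEvalEvaluationTask above follows the same pattern as CustomMathEvaluationTask, CustomMMLUEvaluationTask and CustomBBHEvaluationTask in this file: subclass the config class once, bake in the shared defaults (repo, prompt callable, metrics), and instantiate it with only the per-subset differences. A stripped-down, self-contained sketch of that pattern; the base class, field set and subset names here are placeholders rather than the LightevalTaskConfig API:

from typing import Callable, List, Optional


def agi_eval_prompt_sketch(line: dict, task_name: Optional[str] = None) -> dict:
    # Hypothetical default prompt builder for the sketch.
    return {"task_name": task_name, "query": line["question"], "choices": line["options"], "gold_index": 0}


class BaseTaskConfigSketch:
    # Placeholder for LightevalTaskConfig with only the fields used below.
    def __init__(self, name: str, prompt_function: Callable, hf_repo: str, hf_subset: str, metric: List[str]):
        self.name = name
        self.prompt_function = prompt_function
        self.hf_repo = hf_repo
        self.hf_subset = hf_subset
        self.metric = metric


class AGIEvalTaskSketch(BaseTaskConfigSketch):
    def __init__(
        self,
        name: str,
        hf_subset: str,
        prompt_function: Callable = agi_eval_prompt_sketch,
        metric: Optional[List[str]] = None,
    ):
        super().__init__(
            name=name,
            prompt_function=prompt_function,
            hf_repo="lighteval/agi_eval_en",
            hf_subset=hf_subset,
            metric=metric or ["loglikelihood_acc"],
        )


AGIEVAL_SKETCH = [
    AGIEvalTaskSketch(name="agi_eval:lsat-lr", hf_subset="lsat-lr"),
    AGIEvalTaskSketch(name="agi_eval:sat-math", hf_subset="sat-math"),
]
print([(t.name, t.prompt_function.__name__) for t in AGIEVAL_SKETCH])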
_TASKS_STRINGS.extend(AGIEVAL_STRING) @@ -670,7 +673,7 @@ def agi_eval_prompt_no_letters(line, task_name: str = None): # HUMAN EVAL ## # human_eval = LightevalTaskConfig( # name="human_eval", -# prompt_function="human_eval", +# prompt_function=prompt.human_eval", # hf_repo="lighteval/human_eval", # metric=["human_eval_pass_at_1"], # ), diff --git a/examples/nanotron/custom_task.py b/examples/nanotron/custom_task.py index 77f43c65..ccbae7b9 100644 --- a/examples/nanotron/custom_task.py +++ b/examples/nanotron/custom_task.py @@ -20,81 +20,84 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -TASKS_TABLE = [ - { - "name": "mmlu:anatomy", - "suite": ["custom"], - "prompt_function": "mmlu_anatomy", - "hf_repo": "lighteval/mmlu", - "hf_subset": "anatomy", - "hf_avail_splits": ["auxiliary_train", "test", "validation", "dev"], - "evaluation_splits": ["test"], - "few_shots_split": "dev", - "few_shots_select": "sequential", - "generation_size": 5, - "metric": ["loglikelihood_acc_single_token"], - "stop_sequence": ["\n"], - "output_regex": None, - "frozen": False, - }, - { - "name": "mmlu:anatomy_signs", - "suite": ["custom"], - "prompt_function": "mmlu_anatomy_signs", - "hf_repo": "lighteval/mmlu", - "hf_subset": "anatomy", - "hf_avail_splits": ["auxiliary_train", "test", "validation", "dev"], - "evaluation_splits": ["test"], - "few_shots_split": "dev", - "few_shots_select": "sequential", - "generation_size": 5, - "metric": ["loglikelihood_acc_single_token"], - "stop_sequence": ["\n"], - "output_regex": None, - "frozen": False, - }, -] - - -def mmlu_anatomy_signs(line): - return mmlu_signs(line, "anatomy") +from lighteval.tasks.lighteval_task import LightevalTaskConfig -def mmlu_anatomy(line): - return mmlu_numbers(line, "anatomy") - - -def mmlu_numbers(line, topic): +def mmlu_signs(line, topic): prompt = f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n" prompt += line["question"] + "\n" - prompt += "".join([f"{key}. {choice}\n" for key, choice in zip(["1", "2", "3", "4"], line["choices"])]) + prompt += "".join([f"{key}. {choice}\n" for key, choice in zip(["+", "*", "=", "#"], line["choices"])]) prompt += "Answer:" - gold_ix = ["1", "2", "3", "4"].index(line["answer"]) if isinstance(line["answer"], str) else line["answer"] + gold_ix = ["+", "*", "=", "#"].index(line["answer"]) if isinstance(line["answer"], str) else line["answer"] is_few_shots = line.get("__few_shots", False) # We are adding few shots return { "query": prompt, - "choices": [" 1", " 2", " 3", " 4"] if is_few_shots else ["1", "2", "3", "4"], - "target_for_fewshot_sorting": [" 1", " 2", " 3", " 4"][gold_ix], + "choices": [" +", " *", " =", " #"] if is_few_shots else ["+", "*", "=", "#"], + "target_for_fewshot_sorting": [" +", " *", " =", " #"][gold_ix], "gold_index": gold_ix, "instruction": f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n", } -def mmlu_signs(line, topic): +def mmlu_anatomy_signs(line): + return mmlu_signs(line, "anatomy") + + +def mmlu_numbers(line, topic): prompt = f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n" prompt += line["question"] + "\n" - prompt += "".join([f"{key}. {choice}\n" for key, choice in zip(["+", "*", "=", "#"], line["choices"])]) + prompt += "".join([f"{key}. 
{choice}\n" for key, choice in zip(["1", "2", "3", "4"], line["choices"])]) prompt += "Answer:" - gold_ix = ["+", "*", "=", "#"].index(line["answer"]) if isinstance(line["answer"], str) else line["answer"] + gold_ix = ["1", "2", "3", "4"].index(line["answer"]) if isinstance(line["answer"], str) else line["answer"] is_few_shots = line.get("__few_shots", False) # We are adding few shots return { "query": prompt, - "choices": [" +", " *", " =", " #"] if is_few_shots else ["+", "*", "=", "#"], - "target_for_fewshot_sorting": [" +", " *", " =", " #"][gold_ix], + "choices": [" 1", " 2", " 3", " 4"] if is_few_shots else ["1", "2", "3", "4"], + "target_for_fewshot_sorting": [" 1", " 2", " 3", " 4"][gold_ix], "gold_index": gold_ix, "instruction": f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n", } + + +def mmlu_anatomy(line): + return mmlu_numbers(line, "anatomy") + + +TASKS_TABLE = [ + LightevalTaskConfig( + name="mmlu:anatomy", + suite=["custom"], + prompt_function=mmlu_anatomy, + hf_repo="lighteval/mmlu", + hf_subset="anatomy", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=5, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + ), + LightevalTaskConfig( + name="mmlu:anatomy_signs", + suite=["custom"], + prompt_function=mmlu_anatomy_signs, + hf_repo="lighteval/mmlu", + hf_subset="anatomy", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=5, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + ), +] diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index b1dbe616..453d57e0 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -57,6 +57,8 @@ class EnhancedJSONEncoder(json.JSONEncoder): def default(self, o): if is_dataclass(o): return asdict(o) + if callable(o): + return o.__name__ return super().default(o) diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py index dbfdfe09..468f12d5 100644 --- a/src/lighteval/tasks/default_tasks.py +++ b/src/lighteval/tasks/default_tasks.py @@ -19,13 +19,14 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
+import lighteval.tasks.tasks_prompt_formatting as prompt from lighteval.tasks.lighteval_task import LightevalTaskConfig abstract_narrative_understanding_bigbench = LightevalTaskConfig( name="abstract_narrative_understanding", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="abstract_narrative_understanding", hf_avail_splits=["default", "train", "validation"], @@ -43,7 +44,7 @@ agieval_aqua_rat_lighteval = LightevalTaskConfig( name="agieval:aqua-rat", suite=["lighteval"], - prompt_function="agieval", + prompt_function=prompt.agieval, hf_repo="dmayhem93/agieval-aqua-rat", hf_subset="default", hf_avail_splits=["test"], @@ -61,7 +62,7 @@ agieval_gaokao_biology_lighteval = LightevalTaskConfig( name="agieval:gaokao-biology", suite=["lighteval"], - prompt_function="agieval", + prompt_function=prompt.agieval, hf_repo="dmayhem93/agieval-gaokao-biology", hf_subset="default", hf_avail_splits=["test"], @@ -79,7 +80,7 @@ agieval_gaokao_chemistry_lighteval = LightevalTaskConfig( name="agieval:gaokao-chemistry", suite=["lighteval"], - prompt_function="agieval", + prompt_function=prompt.agieval, hf_repo="dmayhem93/agieval-gaokao-chemistry", hf_subset="default", hf_avail_splits=["test"], @@ -97,7 +98,7 @@ agieval_gaokao_chinese_lighteval = LightevalTaskConfig( name="agieval:gaokao-chinese", suite=["lighteval"], - prompt_function="agieval", + prompt_function=prompt.agieval, hf_repo="dmayhem93/agieval-gaokao-chinese", hf_subset="default", hf_avail_splits=["test"], @@ -115,7 +116,7 @@ agieval_gaokao_english_lighteval = LightevalTaskConfig( name="agieval:gaokao-english", suite=["lighteval"], - prompt_function="agieval", + prompt_function=prompt.agieval, hf_repo="dmayhem93/agieval-gaokao-english", hf_subset="default", hf_avail_splits=["test"], @@ -133,7 +134,7 @@ agieval_gaokao_geography_lighteval = LightevalTaskConfig( name="agieval:gaokao-geography", suite=["lighteval"], - prompt_function="agieval", + prompt_function=prompt.agieval, hf_repo="dmayhem93/agieval-gaokao-geography", hf_subset="default", hf_avail_splits=["test"], @@ -151,7 +152,7 @@ agieval_gaokao_history_lighteval = LightevalTaskConfig( name="agieval:gaokao-history", suite=["lighteval"], - prompt_function="agieval", + prompt_function=prompt.agieval, hf_repo="dmayhem93/agieval-gaokao-history", hf_subset="default", hf_avail_splits=["test"], @@ -169,7 +170,7 @@ agieval_gaokao_mathqa_lighteval = LightevalTaskConfig( name="agieval:gaokao-mathqa", suite=["lighteval"], - prompt_function="agieval", + prompt_function=prompt.agieval, hf_repo="dmayhem93/agieval-gaokao-mathqa", hf_subset="default", hf_avail_splits=["test"], @@ -187,7 +188,7 @@ agieval_gaokao_physics_lighteval = LightevalTaskConfig( name="agieval:gaokao-physics", suite=["lighteval"], - prompt_function="agieval", + prompt_function=prompt.agieval, hf_repo="dmayhem93/agieval-gaokao-physics", hf_subset="default", hf_avail_splits=["test"], @@ -205,7 +206,7 @@ agieval_logiqa_en_lighteval = LightevalTaskConfig( name="agieval:logiqa-en", suite=["lighteval"], - prompt_function="agieval", + prompt_function=prompt.agieval, hf_repo="dmayhem93/agieval-logiqa-en", hf_subset="default", hf_avail_splits=["test"], @@ -223,7 +224,7 @@ agieval_logiqa_zh_lighteval = LightevalTaskConfig( name="agieval:logiqa-zh", suite=["lighteval"], - prompt_function="agieval", + prompt_function=prompt.agieval, hf_repo="dmayhem93/agieval-logiqa-zh", hf_subset="default", hf_avail_splits=["test"], @@ -241,7 +242,7 @@ agieval_lsat_ar_lighteval = 
LightevalTaskConfig( name="agieval:lsat-ar", suite=["lighteval"], - prompt_function="agieval", + prompt_function=prompt.agieval, hf_repo="dmayhem93/agieval-lsat-ar", hf_subset="default", hf_avail_splits=["test"], @@ -259,7 +260,7 @@ agieval_lsat_lr_lighteval = LightevalTaskConfig( name="agieval:lsat-lr", suite=["lighteval"], - prompt_function="agieval", + prompt_function=prompt.agieval, hf_repo="dmayhem93/agieval-lsat-lr", hf_subset="default", hf_avail_splits=["test"], @@ -277,7 +278,7 @@ agieval_lsat_rc_lighteval = LightevalTaskConfig( name="agieval:lsat-rc", suite=["lighteval"], - prompt_function="agieval", + prompt_function=prompt.agieval, hf_repo="dmayhem93/agieval-lsat-rc", hf_subset="default", hf_avail_splits=["test"], @@ -295,7 +296,7 @@ agieval_sat_en_lighteval = LightevalTaskConfig( name="agieval:sat-en", suite=["lighteval"], - prompt_function="agieval", + prompt_function=prompt.agieval, hf_repo="dmayhem93/agieval-sat-en", hf_subset="default", hf_avail_splits=["test"], @@ -313,7 +314,7 @@ agieval_sat_en_without_passage_lighteval = LightevalTaskConfig( name="agieval:sat-en-without-passage", suite=["lighteval"], - prompt_function="agieval", + prompt_function=prompt.agieval, hf_repo="dmayhem93/agieval-sat-en-without-passage", hf_subset="default", hf_avail_splits=["test"], @@ -331,7 +332,7 @@ agieval_sat_math_lighteval = LightevalTaskConfig( name="agieval:sat-math", suite=["lighteval"], - prompt_function="agieval", + prompt_function=prompt.agieval, hf_repo="dmayhem93/agieval-sat-math", hf_subset="default", hf_avail_splits=["test"], @@ -349,7 +350,7 @@ anachronisms_bigbench = LightevalTaskConfig( name="anachronisms", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="anachronisms", hf_avail_splits=["default", "train", "validation"], @@ -367,7 +368,7 @@ analogical_similarity_bigbench = LightevalTaskConfig( name="analogical_similarity", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="analogical_similarity", hf_avail_splits=["default", "train", "validation"], @@ -385,7 +386,7 @@ analytic_entailment_bigbench = LightevalTaskConfig( name="analytic_entailment", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="analytic_entailment", hf_avail_splits=["default", "train", "validation"], @@ -403,7 +404,7 @@ anli_lighteval = LightevalTaskConfig( name="anli", suite=["lighteval", "anli"], - prompt_function="anli", + prompt_function=prompt.anli, hf_repo="anli", hf_subset="plain_text", hf_avail_splits=[ @@ -431,7 +432,7 @@ anli_r1_lighteval = LightevalTaskConfig( name="anli:r1", suite=["lighteval", "anli"], - prompt_function="anli", + prompt_function=prompt.anli, hf_repo="anli", hf_subset="plain_text", hf_avail_splits=["train_r1", "dev_r1", "test_r1"], @@ -449,7 +450,7 @@ anli_r2_lighteval = LightevalTaskConfig( name="anli:r2", suite=["lighteval", "anli"], - prompt_function="anli", + prompt_function=prompt.anli, hf_repo="anli", hf_subset="plain_text", hf_avail_splits=["train_r2", "dev_r2", "test_r2"], @@ -467,7 +468,7 @@ anli_r3_lighteval = LightevalTaskConfig( name="anli:r3", suite=["lighteval", "anli"], - prompt_function="anli", + prompt_function=prompt.anli, hf_repo="anli", hf_subset="plain_text", hf_avail_splits=["train_r3", "dev_r3", "test_r3"], @@ -485,7 +486,7 @@ arc_c_letters_original = LightevalTaskConfig( name="arc:c:letters", suite=["original", 
"arc"], - prompt_function="arc_with_options_letters_predict", + prompt_function=prompt.arc_with_options_letters_predict, hf_repo="ai2_arc", hf_subset="ARC-Challenge", hf_avail_splits=["train", "validation", "test"], @@ -503,7 +504,7 @@ arc_c_options_original = LightevalTaskConfig( name="arc:c:options", suite=["original", "arc"], - prompt_function="arc_with_options", + prompt_function=prompt.arc_with_options, hf_repo="ai2_arc", hf_subset="ARC-Challenge", hf_avail_splits=["train", "validation", "test"], @@ -521,7 +522,7 @@ arc_c_simple_original = LightevalTaskConfig( name="arc:c:simple", suite=["original", "arc"], - prompt_function="arc", + prompt_function=prompt.arc, hf_repo="ai2_arc", hf_subset="ARC-Challenge", hf_avail_splits=["train", "validation", "test"], @@ -539,7 +540,7 @@ arc_challenge_leaderboard = LightevalTaskConfig( name="arc:challenge", suite=["leaderboard", "arc"], - prompt_function="arc", + prompt_function=prompt.arc, hf_repo="ai2_arc", hf_subset="ARC-Challenge", hf_avail_splits=["train", "test"], @@ -557,7 +558,7 @@ arc_easy_lighteval = LightevalTaskConfig( name="arc:easy", suite=["lighteval", "arc"], - prompt_function="arc", + prompt_function=prompt.arc, hf_repo="ai2_arc", hf_subset="ARC-Easy", hf_avail_splits=["train", "validation", "test"], @@ -575,7 +576,7 @@ arithmetic_1dc_lighteval = LightevalTaskConfig( name="arithmetic:1dc", suite=["lighteval", "arithmetic"], - prompt_function="arithmetic", + prompt_function=prompt.arithmetic, hf_repo="EleutherAI/arithmetic", hf_subset="arithmetic_1dc", hf_avail_splits=["validation"], @@ -593,7 +594,7 @@ arithmetic_2da_lighteval = LightevalTaskConfig( name="arithmetic:2da", suite=["lighteval", "arithmetic"], - prompt_function="arithmetic", + prompt_function=prompt.arithmetic, hf_repo="EleutherAI/arithmetic", hf_subset="arithmetic_2da", hf_avail_splits=["validation"], @@ -611,7 +612,7 @@ arithmetic_2dm_lighteval = LightevalTaskConfig( name="arithmetic:2dm", suite=["lighteval", "arithmetic"], - prompt_function="arithmetic", + prompt_function=prompt.arithmetic, hf_repo="EleutherAI/arithmetic", hf_subset="arithmetic_2dm", hf_avail_splits=["validation"], @@ -629,7 +630,7 @@ arithmetic_2ds_lighteval = LightevalTaskConfig( name="arithmetic:2ds", suite=["lighteval", "arithmetic"], - prompt_function="arithmetic", + prompt_function=prompt.arithmetic, hf_repo="EleutherAI/arithmetic", hf_subset="arithmetic_2ds", hf_avail_splits=["validation"], @@ -647,7 +648,7 @@ arithmetic_3da_lighteval = LightevalTaskConfig( name="arithmetic:3da", suite=["lighteval", "arithmetic"], - prompt_function="arithmetic", + prompt_function=prompt.arithmetic, hf_repo="EleutherAI/arithmetic", hf_subset="arithmetic_3da", hf_avail_splits=["validation"], @@ -665,7 +666,7 @@ arithmetic_3ds_lighteval = LightevalTaskConfig( name="arithmetic:3ds", suite=["lighteval", "arithmetic"], - prompt_function="arithmetic", + prompt_function=prompt.arithmetic, hf_repo="EleutherAI/arithmetic", hf_subset="arithmetic_3ds", hf_avail_splits=["validation"], @@ -683,7 +684,7 @@ arithmetic_4da_lighteval = LightevalTaskConfig( name="arithmetic:4da", suite=["lighteval", "arithmetic"], - prompt_function="arithmetic", + prompt_function=prompt.arithmetic, hf_repo="EleutherAI/arithmetic", hf_subset="arithmetic_4da", hf_avail_splits=["validation"], @@ -701,7 +702,7 @@ arithmetic_4ds_lighteval = LightevalTaskConfig( name="arithmetic:4ds", suite=["lighteval", "arithmetic"], - prompt_function="arithmetic", + prompt_function=prompt.arithmetic, hf_repo="EleutherAI/arithmetic", hf_subset="arithmetic_4ds", 
hf_avail_splits=["validation"], @@ -719,7 +720,7 @@ arithmetic_5da_lighteval = LightevalTaskConfig( name="arithmetic:5da", suite=["lighteval", "arithmetic"], - prompt_function="arithmetic", + prompt_function=prompt.arithmetic, hf_repo="EleutherAI/arithmetic", hf_subset="arithmetic_5da", hf_avail_splits=["validation"], @@ -737,7 +738,7 @@ arithmetic_5ds_lighteval = LightevalTaskConfig( name="arithmetic:5ds", suite=["lighteval", "arithmetic"], - prompt_function="arithmetic", + prompt_function=prompt.arithmetic, hf_repo="EleutherAI/arithmetic", hf_subset="arithmetic_5ds", hf_avail_splits=["validation"], @@ -755,7 +756,7 @@ arithmetic_bb_bigbench = LightevalTaskConfig( name="arithmetic_bb", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="arithmetic", hf_avail_splits=["default", "train", "validation"], @@ -773,7 +774,7 @@ ascii_word_recognition_bigbench = LightevalTaskConfig( name="ascii_word_recognition", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="ascii_word_recognition", hf_avail_splits=["default", "train", "validation"], @@ -791,7 +792,7 @@ asdiv_lighteval = LightevalTaskConfig( name="asdiv", suite=["lighteval"], - prompt_function="asdiv", + prompt_function=prompt.asdiv, hf_repo="EleutherAI/asdiv", hf_subset="asdiv", hf_avail_splits=["validation"], @@ -809,7 +810,7 @@ authorship_verification_bigbench = LightevalTaskConfig( name="authorship_verification", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="authorship_verification", hf_avail_splits=["default", "train", "validation"], @@ -827,7 +828,7 @@ auto_categorization_bigbench = LightevalTaskConfig( name="auto_categorization", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="auto_categorization", hf_avail_splits=["default", "train", "validation"], @@ -845,7 +846,7 @@ auto_debugging_bigbench_lite = LightevalTaskConfig( name="auto_debugging", suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function="bigbench_linefeed_before_and_after_query", + prompt_function=prompt.bigbench_linefeed_before_and_after_query, hf_repo="bigbench", hf_subset="auto_debugging", hf_avail_splits=["default", "train", "validation"], @@ -862,7 +863,7 @@ babi_qa_helm = LightevalTaskConfig( name="babi_qa", suite=["helm"], - prompt_function="babi_qa", + prompt_function=prompt.babi_qa, hf_repo="facebook/babi_qa", hf_subset="en-valid-qa1", hf_avail_splits=["train", "test", "validation"], @@ -880,7 +881,7 @@ bigbench_causal_judgment_lighteval = LightevalTaskConfig( name="bigbench:causal_judgment", suite=["lighteval"], - prompt_function="bbh_lighteval", + prompt_function=prompt.bbh_lighteval, hf_repo="lighteval/bbh", hf_subset="causal_judgement", hf_avail_splits=["train"], @@ -898,7 +899,7 @@ bigbench_date_understanding_lighteval = LightevalTaskConfig( name="bigbench:date_understanding", suite=["lighteval"], - prompt_function="bbh_lighteval", + prompt_function=prompt.bbh_lighteval, hf_repo="lighteval/bbh", hf_subset="date_understanding", hf_avail_splits=["train"], @@ -916,7 +917,7 @@ bigbench_disambiguation_qa_lighteval = LightevalTaskConfig( name="bigbench:disambiguation_qa", suite=["lighteval"], - prompt_function="bbh_lighteval", + prompt_function=prompt.bbh_lighteval, hf_repo="lighteval/bbh", hf_subset="disambiguation_qa", 
hf_avail_splits=["train"], @@ -934,7 +935,7 @@ bigbench_geometric_shapes_lighteval = LightevalTaskConfig( name="bigbench:geometric_shapes", suite=["lighteval"], - prompt_function="bbh_lighteval", + prompt_function=prompt.bbh_lighteval, hf_repo="lighteval/bbh", hf_subset="geometric_shapes", hf_avail_splits=["train"], @@ -952,7 +953,7 @@ bigbench_logical_deduction_five_objects_lighteval = LightevalTaskConfig( name="bigbench:logical_deduction_five_objects", suite=["lighteval"], - prompt_function="bbh_lighteval", + prompt_function=prompt.bbh_lighteval, hf_repo="lighteval/bbh", hf_subset="logical_deduction_five_objects", hf_avail_splits=["train"], @@ -970,7 +971,7 @@ bigbench_logical_deduction_seven_objects_lighteval = LightevalTaskConfig( name="bigbench:logical_deduction_seven_objects", suite=["lighteval"], - prompt_function="bbh_lighteval", + prompt_function=prompt.bbh_lighteval, hf_repo="lighteval/bbh", hf_subset="logical_deduction_seven_objects", hf_avail_splits=["train"], @@ -988,7 +989,7 @@ bigbench_logical_deduction_three_objects_lighteval = LightevalTaskConfig( name="bigbench:logical_deduction_three_objects", suite=["lighteval"], - prompt_function="bbh_lighteval", + prompt_function=prompt.bbh_lighteval, hf_repo="lighteval/bbh", hf_subset="logical_deduction_three_objects", hf_avail_splits=["train"], @@ -1006,7 +1007,7 @@ bigbench_movie_recommendation_lighteval = LightevalTaskConfig( name="bigbench:movie_recommendation", suite=["lighteval"], - prompt_function="bbh_lighteval", + prompt_function=prompt.bbh_lighteval, hf_repo="lighteval/bbh", hf_subset="movie_recommendation", hf_avail_splits=["train"], @@ -1024,7 +1025,7 @@ bigbench_navigate_lighteval = LightevalTaskConfig( name="bigbench:navigate", suite=["lighteval"], - prompt_function="bbh_lighteval", + prompt_function=prompt.bbh_lighteval, hf_repo="lighteval/bbh", hf_subset="navigate", hf_avail_splits=["train"], @@ -1042,7 +1043,7 @@ bigbench_reasoning_about_colored_objects_lighteval = LightevalTaskConfig( name="bigbench:reasoning_about_colored_objects", suite=["lighteval"], - prompt_function="bbh_lighteval", + prompt_function=prompt.bbh_lighteval, hf_repo="lighteval/bbh", hf_subset="reasoning_about_colored_objects", hf_avail_splits=["train"], @@ -1060,7 +1061,7 @@ bigbench_ruin_names_lighteval = LightevalTaskConfig( name="bigbench:ruin_names", suite=["lighteval"], - prompt_function="bbh_lighteval", + prompt_function=prompt.bbh_lighteval, hf_repo="lighteval/bbh", hf_subset="ruin_names", hf_avail_splits=["train"], @@ -1078,7 +1079,7 @@ bigbench_salient_translation_error_detection_lighteval = LightevalTaskConfig( name="bigbench:salient_translation_error_detection", suite=["lighteval"], - prompt_function="bbh_lighteval", + prompt_function=prompt.bbh_lighteval, hf_repo="lighteval/bbh", hf_subset="salient_translation_error_detection", hf_avail_splits=["train"], @@ -1096,7 +1097,7 @@ bigbench_snarks_lighteval = LightevalTaskConfig( name="bigbench:snarks", suite=["lighteval"], - prompt_function="bbh_lighteval", + prompt_function=prompt.bbh_lighteval, hf_repo="lighteval/bbh", hf_subset="snarks", hf_avail_splits=["train"], @@ -1114,7 +1115,7 @@ bigbench_sports_understanding_lighteval = LightevalTaskConfig( name="bigbench:sports_understanding", suite=["lighteval"], - prompt_function="bbh_lighteval", + prompt_function=prompt.bbh_lighteval, hf_repo="lighteval/bbh", hf_subset="sports_understanding", hf_avail_splits=["train"], @@ -1132,7 +1133,7 @@ bigbench_temporal_sequences_lighteval = LightevalTaskConfig( name="bigbench:temporal_sequences", 
suite=["lighteval"], - prompt_function="bbh_lighteval", + prompt_function=prompt.bbh_lighteval, hf_repo="lighteval/bbh", hf_subset="temporal_sequences", hf_avail_splits=["train"], @@ -1150,7 +1151,7 @@ bigbench_tracking_shuffled_objects_five_objects_lighteval = LightevalTaskConfig( name="bigbench:tracking_shuffled_objects_five_objects", suite=["lighteval"], - prompt_function="bbh_lighteval", + prompt_function=prompt.bbh_lighteval, hf_repo="lighteval/bbh", hf_subset="tracking_shuffled_objects_five_objects", hf_avail_splits=["train"], @@ -1168,7 +1169,7 @@ bigbench_tracking_shuffled_objects_seven_objects_lighteval = LightevalTaskConfig( name="bigbench:tracking_shuffled_objects_seven_objects", suite=["lighteval"], - prompt_function="bbh_lighteval", + prompt_function=prompt.bbh_lighteval, hf_repo="lighteval/bbh", hf_subset="tracking_shuffled_objects_seven_objects", hf_avail_splits=["train"], @@ -1186,7 +1187,7 @@ bigbench_tracking_shuffled_objects_three_objects_lighteval = LightevalTaskConfig( name="bigbench:tracking_shuffled_objects_three_objects", suite=["lighteval"], - prompt_function="bbh_lighteval", + prompt_function=prompt.bbh_lighteval, hf_repo="lighteval/bbh", hf_subset="tracking_shuffled_objects_three_objects", hf_avail_splits=["train"], @@ -1204,7 +1205,7 @@ bigbench_causal_judgment_harness = LightevalTaskConfig( name="bigbench:causal_judgment", suite=["harness"], - prompt_function="bbh_harness", + prompt_function=prompt.bbh_harness, hf_repo="lighteval/bbh", hf_subset="causal_judgement", hf_avail_splits=["train"], @@ -1223,7 +1224,7 @@ bigbench_date_understanding_harness = LightevalTaskConfig( name="bigbench:date_understanding", suite=["harness"], - prompt_function="bbh_harness", + prompt_function=prompt.bbh_harness, hf_repo="lighteval/bbh", hf_subset="date_understanding", hf_avail_splits=["train"], @@ -1242,7 +1243,7 @@ bigbench_disambiguation_qa_harness = LightevalTaskConfig( name="bigbench:disambiguation_qa", suite=["harness"], - prompt_function="bbh_harness", + prompt_function=prompt.bbh_harness, hf_repo="lighteval/bbh", hf_subset="disambiguation_qa", hf_avail_splits=["train"], @@ -1261,7 +1262,7 @@ bigbench_geometric_shapes_harness = LightevalTaskConfig( name="bigbench:geometric_shapes", suite=["harness"], - prompt_function="bbh_harness", + prompt_function=prompt.bbh_harness, hf_repo="lighteval/bbh", hf_subset="geometric_shapes", hf_avail_splits=["train"], @@ -1280,7 +1281,7 @@ bigbench_logical_deduction_five_objects_harness = LightevalTaskConfig( name="bigbench:logical_deduction_five_objects", suite=["harness"], - prompt_function="bbh_harness", + prompt_function=prompt.bbh_harness, hf_repo="lighteval/bbh", hf_subset="logical_deduction_five_objects", hf_avail_splits=["train"], @@ -1299,7 +1300,7 @@ bigbench_logical_deduction_seven_objects_harness = LightevalTaskConfig( name="bigbench:logical_deduction_seven_objects", suite=["harness"], - prompt_function="bbh_harness", + prompt_function=prompt.bbh_harness, hf_repo="lighteval/bbh", hf_subset="logical_deduction_seven_objects", hf_avail_splits=["train"], @@ -1318,7 +1319,7 @@ bigbench_logical_deduction_three_objects_harness = LightevalTaskConfig( name="bigbench:logical_deduction_three_objects", suite=["harness"], - prompt_function="bbh_harness", + prompt_function=prompt.bbh_harness, hf_repo="lighteval/bbh", hf_subset="logical_deduction_three_objects", hf_avail_splits=["train"], @@ -1337,7 +1338,7 @@ bigbench_movie_recommendation_harness = LightevalTaskConfig( name="bigbench:movie_recommendation", suite=["harness"], - 
prompt_function="bbh_harness", + prompt_function=prompt.bbh_harness, hf_repo="lighteval/bbh", hf_subset="movie_recommendation", hf_avail_splits=["train"], @@ -1356,7 +1357,7 @@ bigbench_navigate_harness = LightevalTaskConfig( name="bigbench:navigate", suite=["harness"], - prompt_function="bbh_harness", + prompt_function=prompt.bbh_harness, hf_repo="lighteval/bbh", hf_subset="navigate", hf_avail_splits=["train"], @@ -1375,7 +1376,7 @@ bigbench_reasoning_about_colored_objects_harness = LightevalTaskConfig( name="bigbench:reasoning_about_colored_objects", suite=["harness"], - prompt_function="bbh_harness", + prompt_function=prompt.bbh_harness, hf_repo="lighteval/bbh", hf_subset="reasoning_about_colored_objects", hf_avail_splits=["train"], @@ -1394,7 +1395,7 @@ bigbench_ruin_names_harness = LightevalTaskConfig( name="bigbench:ruin_names", suite=["harness"], - prompt_function="bbh_harness", + prompt_function=prompt.bbh_harness, hf_repo="lighteval/bbh", hf_subset="ruin_names", hf_avail_splits=["train"], @@ -1413,7 +1414,7 @@ bigbench_salient_translation_error_detection_harness = LightevalTaskConfig( name="bigbench:salient_translation_error_detection", suite=["harness"], - prompt_function="bbh_harness", + prompt_function=prompt.bbh_harness, hf_repo="lighteval/bbh", hf_subset="salient_translation_error_detection", hf_avail_splits=["train"], @@ -1432,7 +1433,7 @@ bigbench_snarks_harness = LightevalTaskConfig( name="bigbench:snarks", suite=["harness"], - prompt_function="bbh_harness", + prompt_function=prompt.bbh_harness, hf_repo="lighteval/bbh", hf_subset="snarks", hf_avail_splits=["train"], @@ -1451,7 +1452,7 @@ bigbench_sports_understanding_harness = LightevalTaskConfig( name="bigbench:sports_understanding", suite=["harness"], - prompt_function="bbh_harness", + prompt_function=prompt.bbh_harness, hf_repo="lighteval/bbh", hf_subset="sports_understanding", hf_avail_splits=["train"], @@ -1470,7 +1471,7 @@ bigbench_temporal_sequences_harness = LightevalTaskConfig( name="bigbench:temporal_sequences", suite=["harness"], - prompt_function="bbh_harness", + prompt_function=prompt.bbh_harness, hf_repo="lighteval/bbh", hf_subset="temporal_sequences", hf_avail_splits=["train"], @@ -1489,7 +1490,7 @@ bigbench_tracking_shuffled_objects_five_objects_harness = LightevalTaskConfig( name="bigbench:tracking_shuffled_objects_five_objects", suite=["harness"], - prompt_function="bbh_harness", + prompt_function=prompt.bbh_harness, hf_repo="lighteval/bbh", hf_subset="tracking_shuffled_objects_five_objects", hf_avail_splits=["train"], @@ -1508,7 +1509,7 @@ bigbench_tracking_shuffled_objects_seven_objects_harness = LightevalTaskConfig( name="bigbench:tracking_shuffled_objects_seven_objects", suite=["harness"], - prompt_function="bbh_harness", + prompt_function=prompt.bbh_harness, hf_repo="lighteval/bbh", hf_subset="tracking_shuffled_objects_seven_objects", hf_avail_splits=["train"], @@ -1527,7 +1528,7 @@ bigbench_tracking_shuffled_objects_three_objects_harness = LightevalTaskConfig( name="bigbench:tracking_shuffled_objects_three_objects", suite=["harness"], - prompt_function="bbh_harness", + prompt_function=prompt.bbh_harness, hf_repo="lighteval/bbh", hf_subset="tracking_shuffled_objects_three_objects", hf_avail_splits=["train"], @@ -1546,7 +1547,7 @@ bbh_boolean_expressions_harness = LightevalTaskConfig( name="bbh:boolean_expressions", suite=["harness"], - prompt_function="bbh_boolean_expressions", + prompt_function=prompt.bbh_boolean_expressions, hf_repo="lukaemon/bbh", hf_subset="boolean_expressions", 
hf_avail_splits=["test"], @@ -1570,7 +1571,7 @@ bbh_causal_judgment_harness = LightevalTaskConfig( name="bbh:causal_judgment", suite=["harness"], - prompt_function="bbh_causal_judgment", + prompt_function=prompt.bbh_causal_judgment, hf_repo="lukaemon/bbh", hf_subset="causal_judgement", hf_avail_splits=["test"], @@ -1594,7 +1595,7 @@ bbh_date_understanding_harness = LightevalTaskConfig( name="bbh:date_understanding", suite=["harness"], - prompt_function="bbh_date_understanding", + prompt_function=prompt.bbh_date_understanding, hf_repo="lukaemon/bbh", hf_subset="date_understanding", hf_avail_splits=["test"], @@ -1618,7 +1619,7 @@ bbh_disambiguation_qa_harness = LightevalTaskConfig( name="bbh:disambiguation_qa", suite=["harness"], - prompt_function="bbh_disambiguation_qa", + prompt_function=prompt.bbh_disambiguation_qa, hf_repo="lukaemon/bbh", hf_subset="disambiguation_qa", hf_avail_splits=["test"], @@ -1642,7 +1643,7 @@ bbh_dyck_languages_harness = LightevalTaskConfig( name="bbh:dyck_languages", suite=["harness"], - prompt_function="bbh_dyck_languages", + prompt_function=prompt.bbh_dyck_languages, hf_repo="lukaemon/bbh", hf_subset="dyck_languages", hf_avail_splits=["test"], @@ -1666,7 +1667,7 @@ bbh_formal_fallacies_harness = LightevalTaskConfig( name="bbh:formal_fallacies", suite=["harness"], - prompt_function="bbh_formal_fallacies", + prompt_function=prompt.bbh_formal_fallacies, hf_repo="lukaemon/bbh", hf_subset="formal_fallacies", hf_avail_splits=["test"], @@ -1690,7 +1691,7 @@ bbh_geometric_shapes_harness = LightevalTaskConfig( name="bbh:geometric_shapes", suite=["harness"], - prompt_function="bbh_geometric_shapes", + prompt_function=prompt.bbh_geometric_shapes, hf_repo="lukaemon/bbh", hf_subset="geometric_shapes", hf_avail_splits=["test"], @@ -1714,7 +1715,7 @@ bbh_hyperbaton_harness = LightevalTaskConfig( name="bbh:hyperbaton", suite=["harness"], - prompt_function="bbh_hyperbaton", + prompt_function=prompt.bbh_hyperbaton, hf_repo="lukaemon/bbh", hf_subset="hyperbaton", hf_avail_splits=["test"], @@ -1738,7 +1739,7 @@ bbh_logical_deduction_five_objects_harness = LightevalTaskConfig( name="bbh:logical_deduction_five_objects", suite=["harness"], - prompt_function="bbh_logical_deduction_five_objects", + prompt_function=prompt.bbh_logical_deduction_five_objects, hf_repo="lukaemon/bbh", hf_subset="logical_deduction_five_objects", hf_avail_splits=["test"], @@ -1762,7 +1763,7 @@ bbh_logical_deduction_seven_objects_harness = LightevalTaskConfig( name="bbh:logical_deduction_seven_objects", suite=["harness"], - prompt_function="bbh_logical_deduction_seven_objects", + prompt_function=prompt.bbh_logical_deduction_seven_objects, hf_repo="lukaemon/bbh", hf_subset="logical_deduction_seven_objects", hf_avail_splits=["test"], @@ -1786,7 +1787,7 @@ bbh_logical_deduction_three_objects_harness = LightevalTaskConfig( name="bbh:logical_deduction_three_objects", suite=["harness"], - prompt_function="bbh_logical_deduction_three_objects", + prompt_function=prompt.bbh_logical_deduction_three_objects, hf_repo="lukaemon/bbh", hf_subset="logical_deduction_three_objects", hf_avail_splits=["test"], @@ -1810,7 +1811,7 @@ bbh_movie_recommendation_harness = LightevalTaskConfig( name="bbh:movie_recommendation", suite=["harness"], - prompt_function="bbh_movie_recommendation", + prompt_function=prompt.bbh_movie_recommendation, hf_repo="lukaemon/bbh", hf_subset="movie_recommendation", hf_avail_splits=["test"], @@ -1834,7 +1835,7 @@ bbh_multistep_arithmetic_two_harness = LightevalTaskConfig( 
name="bbh:multistep_arithmetic_two", suite=["harness"], - prompt_function="bbh_multistep_arithmetic_two", + prompt_function=prompt.bbh_multistep_arithmetic_two, hf_repo="lukaemon/bbh", hf_subset="multistep_arithmetic_two", hf_avail_splits=["test"], @@ -1858,7 +1859,7 @@ bbh_navigate_harness = LightevalTaskConfig( name="bbh:navigate", suite=["harness"], - prompt_function="bbh_navigate", + prompt_function=prompt.bbh_navigate, hf_repo="lukaemon/bbh", hf_subset="navigate", hf_avail_splits=["test"], @@ -1882,7 +1883,7 @@ bbh_object_counting_harness = LightevalTaskConfig( name="bbh:object_counting", suite=["harness"], - prompt_function="bbh_object_counting", + prompt_function=prompt.bbh_object_counting, hf_repo="lukaemon/bbh", hf_subset="object_counting", hf_avail_splits=["test"], @@ -1906,7 +1907,7 @@ bbh_penguins_in_a_table_harness = LightevalTaskConfig( name="bbh:penguins_in_a_table", suite=["harness"], - prompt_function="bbh_penguins_in_a_table", + prompt_function=prompt.bbh_penguins_in_a_table, hf_repo="lukaemon/bbh", hf_subset="penguins_in_a_table", hf_avail_splits=["test"], @@ -1930,7 +1931,7 @@ bbh_reasoning_about_colored_objects_harness = LightevalTaskConfig( name="bbh:reasoning_about_colored_objects", suite=["harness"], - prompt_function="bbh_reasoning_about_colored_objects", + prompt_function=prompt.bbh_reasoning_about_colored_objects, hf_repo="lukaemon/bbh", hf_subset="reasoning_about_colored_objects", hf_avail_splits=["test"], @@ -1954,7 +1955,7 @@ bbh_ruin_names_harness = LightevalTaskConfig( name="bbh:ruin_names", suite=["harness"], - prompt_function="bbh_ruin_names", + prompt_function=prompt.bbh_ruin_names, hf_repo="lukaemon/bbh", hf_subset="ruin_names", hf_avail_splits=["test"], @@ -1978,7 +1979,7 @@ bbh_salient_translation_error_detection_harness = LightevalTaskConfig( name="bbh:salient_translation_error_detection", suite=["harness"], - prompt_function="bbh_salient_translation_error_detection", + prompt_function=prompt.bbh_salient_translation_error_detection, hf_repo="lukaemon/bbh", hf_subset="salient_translation_error_detection", hf_avail_splits=["test"], @@ -2002,7 +2003,7 @@ bbh_snarks_harness = LightevalTaskConfig( name="bbh:snarks", suite=["harness"], - prompt_function="bbh_snarks", + prompt_function=prompt.bbh_snarks, hf_repo="lukaemon/bbh", hf_subset="snarks", hf_avail_splits=["test"], @@ -2026,7 +2027,7 @@ bbh_sports_understanding_harness = LightevalTaskConfig( name="bbh:sports_understanding", suite=["harness"], - prompt_function="bbh_sports_understanding", + prompt_function=prompt.bbh_sports_understanding, hf_repo="lukaemon/bbh", hf_subset="sports_understanding", hf_avail_splits=["test"], @@ -2050,7 +2051,7 @@ bbh_temporal_sequences_harness = LightevalTaskConfig( name="bbh:temporal_sequences", suite=["harness"], - prompt_function="bbh_temporal_sequences", + prompt_function=prompt.bbh_temporal_sequences, hf_repo="lukaemon/bbh", hf_subset="temporal_sequences", hf_avail_splits=["test"], @@ -2074,7 +2075,7 @@ bbh_tracking_shuffled_objects_five_objects_harness = LightevalTaskConfig( name="bbh:tracking_shuffled_objects_five_objects", suite=["harness"], - prompt_function="bbh_tracking_shuffled_objects_five_objects", + prompt_function=prompt.bbh_tracking_shuffled_objects_five_objects, hf_repo="lukaemon/bbh", hf_subset="tracking_shuffled_objects_five_objects", hf_avail_splits=["test"], @@ -2098,7 +2099,7 @@ bbh_tracking_shuffled_objects_seven_objects_harness = LightevalTaskConfig( name="bbh:tracking_shuffled_objects_seven_objects", suite=["harness"], - 
prompt_function="bbh_tracking_shuffled_objects_seven_objects", + prompt_function=prompt.bbh_tracking_shuffled_objects_seven_objects, hf_repo="lukaemon/bbh", hf_subset="tracking_shuffled_objects_seven_objects", hf_avail_splits=["test"], @@ -2122,7 +2123,7 @@ bbh_tracking_shuffled_objects_three_objects_harness = LightevalTaskConfig( name="bbh:tracking_shuffled_objects_three_objects", suite=["harness"], - prompt_function="bbh_tracking_shuffled_objects_three_objects", + prompt_function=prompt.bbh_tracking_shuffled_objects_three_objects, hf_repo="lukaemon/bbh", hf_subset="tracking_shuffled_objects_three_objects", hf_avail_splits=["test"], @@ -2146,7 +2147,7 @@ bbh_web_of_lies_harness = LightevalTaskConfig( name="bbh:web_of_lies", suite=["harness"], - prompt_function="bbh_web_of_lies", + prompt_function=prompt.bbh_web_of_lies, hf_repo="lukaemon/bbh", hf_subset="web_of_lies", hf_avail_splits=["test"], @@ -2170,7 +2171,7 @@ bbh_word_sorting_harness = LightevalTaskConfig( name="bbh:word_sorting", suite=["harness"], - prompt_function="bbh_word_sorting", + prompt_function=prompt.bbh_word_sorting, hf_repo="lukaemon/bbh", hf_subset="word_sorting", hf_avail_splits=["test"], @@ -2194,7 +2195,7 @@ bbq_helm = LightevalTaskConfig( name="bbq", suite=["helm"], - prompt_function="bbq", + prompt_function=prompt.bbq, hf_repo="lighteval/bbq_helm", hf_subset="all", hf_avail_splits=["train", "test"], @@ -2218,7 +2219,7 @@ bbq_Age_helm = LightevalTaskConfig( name="bbq:Age", suite=["helm"], - prompt_function="bbq", + prompt_function=prompt.bbq, hf_repo="lighteval/bbq_helm", hf_subset="Age", hf_avail_splits=["train", "test"], @@ -2242,7 +2243,7 @@ bbq_Disability_status_helm = LightevalTaskConfig( name="bbq:Disability_status", suite=["helm"], - prompt_function="bbq", + prompt_function=prompt.bbq, hf_repo="lighteval/bbq_helm", hf_subset="Disability_status", hf_avail_splits=["train", "test"], @@ -2266,7 +2267,7 @@ bbq_Gender_identity_helm = LightevalTaskConfig( name="bbq:Gender_identity", suite=["helm"], - prompt_function="bbq", + prompt_function=prompt.bbq, hf_repo="lighteval/bbq_helm", hf_subset="Gender_identity", hf_avail_splits=["train", "test"], @@ -2290,7 +2291,7 @@ bbq_Nationality_helm = LightevalTaskConfig( name="bbq=Nationality", suite=["helm"], - prompt_function="bbq", + prompt_function=prompt.bbq, hf_repo="lighteval/bbq_helm", hf_subset="Nationality", hf_avail_splits=["train", "test"], @@ -2314,7 +2315,7 @@ bbq_Physical_appearance_helm = LightevalTaskConfig( name="bbq:Physical_appearance", suite=["helm"], - prompt_function="bbq", + prompt_function=prompt.bbq, hf_repo="lighteval/bbq_helm", hf_subset="Physical_appearance", hf_avail_splits=["train", "test"], @@ -2338,7 +2339,7 @@ bbq_Race_ethnicity_helm = LightevalTaskConfig( name="bbq:Race_ethnicity", suite=["helm"], - prompt_function="bbq", + prompt_function=prompt.bbq, hf_repo="lighteval/bbq_helm", hf_subset="Race_ethnicity", hf_avail_splits=["train", "test"], @@ -2362,7 +2363,7 @@ bbq_Race_x_SES_helm = LightevalTaskConfig( name="bbq:Race_x_SES", suite=["helm"], - prompt_function="bbq", + prompt_function=prompt.bbq, hf_repo="lighteval/bbq_helm", hf_subset="Race_x_SES", hf_avail_splits=["train", "test"], @@ -2386,7 +2387,7 @@ bbq_Race_x_gender_helm = LightevalTaskConfig( name="bbq:Race_x_gender", suite=["helm"], - prompt_function="bbq", + prompt_function=prompt.bbq, hf_repo="lighteval/bbq_helm", hf_subset="Race_x_gender", hf_avail_splits=["train", "test"], @@ -2410,7 +2411,7 @@ bbq_Religion_helm = LightevalTaskConfig( name="bbq:Religion", suite=["helm"], - 
prompt_function="bbq", + prompt_function=prompt.bbq, hf_repo="lighteval/bbq_helm", hf_subset="Religion", hf_avail_splits=["train", "test"], @@ -2434,7 +2435,7 @@ bbq_SES_helm = LightevalTaskConfig( name="bbq:SES", suite=["helm"], - prompt_function="bbq", + prompt_function=prompt.bbq, hf_repo="lighteval/bbq_helm", hf_subset="SES", hf_avail_splits=["train", "test"], @@ -2458,7 +2459,7 @@ bbq_Sexual_orientation_helm = LightevalTaskConfig( name="bbq:Sexual_orientation", suite=["helm"], - prompt_function="bbq", + prompt_function=prompt.bbq, hf_repo="lighteval/bbq_helm", hf_subset="Sexual_orientation", hf_avail_splits=["train", "test"], @@ -2482,7 +2483,7 @@ bbq_lite_json_bigbench_lite = LightevalTaskConfig( name="bbq_lite_json", suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function="bigbench_linefeed_before_whitespace_after_query", + prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, hf_repo="bigbench", hf_subset="bbq_lite_json", hf_avail_splits=["default", "train", "validation"], @@ -2500,7 +2501,7 @@ bigbench_auto_debugging_helm = LightevalTaskConfig( name="bigbench:auto_debugging", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="auto_debugging", hf_avail_splits=["train", "test", "validation"], @@ -2518,7 +2519,7 @@ bigbench_bbq_lite_json_age_ambig_helm = LightevalTaskConfig( name="bigbench:bbq_lite_json:age_ambig", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="bbq_lite_json-age_ambig", hf_avail_splits=["train", "test", "validation"], @@ -2536,7 +2537,7 @@ bigbench_bbq_lite_json_age_disambig_helm = LightevalTaskConfig( name="bigbench:bbq_lite_json:age_disambig", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="bbq_lite_json-age_disambig", hf_avail_splits=["train", "test", "validation"], @@ -2554,7 +2555,7 @@ bigbench_bbq_lite_json_disability_status_ambig_helm = LightevalTaskConfig( name="bigbench:bbq_lite_json:disability_status_ambig", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="bbq_lite_json-disability_status_ambig", hf_avail_splits=["train", "test", "validation"], @@ -2572,7 +2573,7 @@ bigbench_bbq_lite_json_disability_status_disambig_helm = LightevalTaskConfig( name="bigbench:bbq_lite_json:disability_status_disambig", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="bbq_lite_json-disability_status_disambig", hf_avail_splits=["train", "test", "validation"], @@ -2590,7 +2591,7 @@ bigbench_bbq_lite_json_gender_identity_ambig_helm = LightevalTaskConfig( name="bigbench:bbq_lite_json:gender_identity_ambig", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="bbq_lite_json-gender_identity_ambig", hf_avail_splits=["train", "test", "validation"], @@ -2608,7 +2609,7 @@ bigbench_bbq_lite_json_gender_identity_disambig_helm = LightevalTaskConfig( name="bigbench:bbq_lite_json:gender_identity_disambig", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, 
hf_repo="lighteval/bigbench_helm", hf_subset="bbq_lite_json-gender_identity_disambig", hf_avail_splits=["train", "test", "validation"], @@ -2626,7 +2627,7 @@ bigbench_bbq_lite_json_nationality_ambig_helm = LightevalTaskConfig( name="bigbench:bbq_lite_json:nationality_ambig", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="bbq_lite_json-nationality_ambig", hf_avail_splits=["train", "test", "validation"], @@ -2644,7 +2645,7 @@ bigbench_bbq_lite_json_nationality_disambig_helm = LightevalTaskConfig( name="bigbench:bbq_lite_json:nationality_disambig", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="bbq_lite_json-nationality_disambig", hf_avail_splits=["train", "test", "validation"], @@ -2662,7 +2663,7 @@ bigbench_bbq_lite_json_physical_appearance_ambig_helm = LightevalTaskConfig( name="bigbench:bbq_lite_json:physical_appearance_ambig", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="bbq_lite_json-physical_appearance_ambig", hf_avail_splits=["train", "test", "validation"], @@ -2680,7 +2681,7 @@ bigbench_bbq_lite_json_physical_appearance_disambig_helm = LightevalTaskConfig( name="bigbench:bbq_lite_json:physical_appearance_disambig", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="bbq_lite_json-physical_appearance_disambig", hf_avail_splits=["train", "test", "validation"], @@ -2698,7 +2699,7 @@ bigbench_bbq_lite_json_race_ethnicity_ambig_helm = LightevalTaskConfig( name="bigbench:bbq_lite_json:race_ethnicity_ambig", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="bbq_lite_json-race_ethnicity_ambig", hf_avail_splits=["train", "test", "validation"], @@ -2716,7 +2717,7 @@ bigbench_bbq_lite_json_race_ethnicity_disambig_helm = LightevalTaskConfig( name="bigbench:bbq_lite_json:race_ethnicity_disambig", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="bbq_lite_json-race_ethnicity_disambig", hf_avail_splits=["train", "test", "validation"], @@ -2734,7 +2735,7 @@ bigbench_bbq_lite_json_religion_ambig_helm = LightevalTaskConfig( name="bigbench:bbq_lite_json:religion_ambig", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="bbq_lite_json-religion_ambig", hf_avail_splits=["train", "test", "validation"], @@ -2752,7 +2753,7 @@ bigbench_bbq_lite_json_religion_disambig_helm = LightevalTaskConfig( name="bigbench:bbq_lite_json:religion_disambig", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="bbq_lite_json-religion_disambig", hf_avail_splits=["train", "test", "validation"], @@ -2770,7 +2771,7 @@ bigbench_bbq_lite_json_ses_ambig_helm = LightevalTaskConfig( name="bigbench:bbq_lite_json:ses_ambig", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", 
hf_subset="bbq_lite_json-ses_ambig", hf_avail_splits=["train", "test", "validation"], @@ -2788,7 +2789,7 @@ bigbench_bbq_lite_json_ses_disambig_helm = LightevalTaskConfig( name="bigbench:bbq_lite_json:ses_disambig", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="bbq_lite_json-ses_disambig", hf_avail_splits=["train", "test", "validation"], @@ -2806,7 +2807,7 @@ bigbench_bbq_lite_json_sexual_orientation_ambig_helm = LightevalTaskConfig( name="bigbench:bbq_lite_json:sexual_orientation_ambig", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="bbq_lite_json-sexual_orientation_ambig", hf_avail_splits=["train", "test", "validation"], @@ -2824,7 +2825,7 @@ bigbench_bbq_lite_json_sexual_orientation_disambig_helm = LightevalTaskConfig( name="bigbench:bbq_lite_json:sexual_orientation_disambig", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="bbq_lite_json-sexual_orientation_disambig", hf_avail_splits=["train", "test", "validation"], @@ -2842,7 +2843,7 @@ bigbench_code_line_description_helm = LightevalTaskConfig( name="bigbench:code_line_description", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="code_line_description", hf_avail_splits=["train", "test", "validation"], @@ -2860,7 +2861,7 @@ bigbench_conceptual_combinations_contradictions_helm = LightevalTaskConfig( name="bigbench:conceptual_combinations:contradictions", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="conceptual_combinations-contradictions", hf_avail_splits=["train", "test", "validation"], @@ -2878,7 +2879,7 @@ bigbench_conceptual_combinations_emergent_properties_helm = LightevalTaskConfig( name="bigbench:conceptual_combinations:emergent_properties", suite=["helm"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="conceptual_combinations-emergent_properties", hf_avail_splits=["train", "test", "validation"], @@ -2896,7 +2897,7 @@ bigbench_conceptual_combinations_fanciful_fictional_combinations_helm = LightevalTaskConfig( name="bigbench:conceptual_combinations:fanciful_fictional_combinations", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="conceptual_combinations-fanciful_fictional_combinations", hf_avail_splits=["train", "test", "validation"], @@ -2914,7 +2915,7 @@ bigbench_conceptual_combinations_homonyms_helm = LightevalTaskConfig( name="bigbench:conceptual_combinations:homonyms", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="conceptual_combinations-homonyms", hf_avail_splits=["train", "test", "validation"], @@ -2932,7 +2933,7 @@ bigbench_conceptual_combinations_invented_words_helm = LightevalTaskConfig( name="bigbench:conceptual_combinations:invented_words", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", 
hf_subset="conceptual_combinations-invented_words", hf_avail_splits=["train", "test", "validation"], @@ -2950,7 +2951,7 @@ bigbench_conlang_translation_adna_from_helm = LightevalTaskConfig( name="bigbench:conlang_translation:adna_from", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="conlang_translation-adna_from", hf_avail_splits=["train", "test", "validation"], @@ -2968,7 +2969,7 @@ bigbench_conlang_translation_adna_to_helm = LightevalTaskConfig( name="bigbench:conlang_translation:adna_to", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="conlang_translation-adna_to", hf_avail_splits=["train", "test", "validation"], @@ -2986,7 +2987,7 @@ bigbench_conlang_translation_atikampe_from_helm = LightevalTaskConfig( name="bigbench:conlang_translation:atikampe_from", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="conlang_translation-atikampe_from", hf_avail_splits=["train", "test", "validation"], @@ -3004,7 +3005,7 @@ bigbench_conlang_translation_atikampe_to_helm = LightevalTaskConfig( name="bigbench:conlang_translation:atikampe_to", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="conlang_translation-atikampe_to", hf_avail_splits=["train", "test", "validation"], @@ -3022,7 +3023,7 @@ bigbench_conlang_translation_gornam_from_helm = LightevalTaskConfig( name="bigbench:conlang_translation:gornam_from", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="conlang_translation-gornam_from", hf_avail_splits=["train", "test", "validation"], @@ -3040,7 +3041,7 @@ bigbench_conlang_translation_gornam_to_helm = LightevalTaskConfig( name="bigbench:conlang_translation:gornam_to", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="conlang_translation-gornam_to", hf_avail_splits=["train", "test", "validation"], @@ -3058,7 +3059,7 @@ bigbench_conlang_translation_holuan_from_helm = LightevalTaskConfig( name="bigbench:conlang_translation:holuan_from", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="conlang_translation-holuan_from", hf_avail_splits=["train", "test", "validation"], @@ -3076,7 +3077,7 @@ bigbench_conlang_translation_holuan_to_helm = LightevalTaskConfig( name="bigbench:conlang_translation:holuan_to", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="conlang_translation-holuan_to", hf_avail_splits=["train", "test", "validation"], @@ -3094,7 +3095,7 @@ bigbench_conlang_translation_mkafala_from_helm = LightevalTaskConfig( name="bigbench:conlang_translation:mkafala_from", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="conlang_translation-mkafala_from", hf_avail_splits=["train", "test", "validation"], @@ -3112,7 +3113,7 @@ 
bigbench_conlang_translation_mkafala_to_helm = LightevalTaskConfig( name="bigbench:conlang_translation:mkafala_to", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="conlang_translation-mkafala_to", hf_avail_splits=["train", "test", "validation"], @@ -3130,7 +3131,7 @@ bigbench_conlang_translation_postpositive_english_from_helm = LightevalTaskConfig( name="bigbench:conlang_translation:postpositive_english_from", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="conlang_translation-postpositive_english_from", hf_avail_splits=["train", "test", "validation"], @@ -3148,7 +3149,7 @@ bigbench_conlang_translation_postpositive_english_to_helm = LightevalTaskConfig( name="bigbench:conlang_translation:postpositive_english_to", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="conlang_translation-postpositive_english_to", hf_avail_splits=["train", "test", "validation"], @@ -3166,7 +3167,7 @@ bigbench_conlang_translation_unapuri_from_helm = LightevalTaskConfig( name="bigbench:conlang_translation:unapuri_from", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="conlang_translation-unapuri_from", hf_avail_splits=["train", "test", "validation"], @@ -3184,7 +3185,7 @@ bigbench_conlang_translation_unapuri_to_helm = LightevalTaskConfig( name="bigbench:conlang_translation:unapuri_to", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="conlang_translation-unapuri_to", hf_avail_splits=["train", "test", "validation"], @@ -3202,7 +3203,7 @@ bigbench_conlang_translation_vaomi_from_helm = LightevalTaskConfig( name="bigbench:conlang_translation:vaomi_from", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="conlang_translation-vaomi_from", hf_avail_splits=["train", "test", "validation"], @@ -3220,7 +3221,7 @@ bigbench_conlang_translation_vaomi_to_helm = LightevalTaskConfig( name="bigbench:conlang_translation:vaomi_to", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="conlang_translation-vaomi_to", hf_avail_splits=["train", "test", "validation"], @@ -3238,7 +3239,7 @@ bigbench_emoji_movie_helm = LightevalTaskConfig( name="bigbench:emoji_movie", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="emoji_movie", hf_avail_splits=["train", "test", "validation"], @@ -3256,7 +3257,7 @@ bigbench_formal_fallacies_syllogisms_negation_helm = LightevalTaskConfig( name="bigbench:formal_fallacies_syllogisms_negation", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="formal_fallacies_syllogisms_negation", hf_avail_splits=["train", "test", "validation"], @@ -3274,7 +3275,7 @@ bigbench_hindu_knowledge_helm = LightevalTaskConfig( name="bigbench:hindu_knowledge", suite=["helm", 
"bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="hindu_knowledge", hf_avail_splits=["train", "test", "validation"], @@ -3292,7 +3293,7 @@ bigbench_known_unknowns_helm = LightevalTaskConfig( name="bigbench:known_unknowns", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="known_unknowns", hf_avail_splits=["train", "test", "validation"], @@ -3310,7 +3311,7 @@ bigbench_language_identification_helm = LightevalTaskConfig( name="bigbench:language_identification", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="language_identification", hf_avail_splits=["train", "test", "validation"], @@ -3328,7 +3329,7 @@ bigbench_linguistics_puzzles_helm = LightevalTaskConfig( name="bigbench:linguistics_puzzles", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="linguistics_puzzles", hf_avail_splits=["train", "test", "validation"], @@ -3346,7 +3347,7 @@ bigbench_logic_grid_puzzle_helm = LightevalTaskConfig( name="bigbench:logic_grid_puzzle", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="logic_grid_puzzle", hf_avail_splits=["train", "test", "validation"], @@ -3364,7 +3365,7 @@ bigbench_logical_deduction_five_objects_helm = LightevalTaskConfig( name="bigbench:logical_deduction-five_objects", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="logical_deduction-five_objects", hf_avail_splits=["train", "test", "validation"], @@ -3382,7 +3383,7 @@ bigbench_logical_deduction_seven_objects_helm = LightevalTaskConfig( name="bigbench:logical_deduction-seven_objects", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="logical_deduction-seven_objects", hf_avail_splits=["train", "test", "validation"], @@ -3400,7 +3401,7 @@ bigbench_logical_deduction_three_objects_helm = LightevalTaskConfig( name="bigbench:logical_deduction-three_objects", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="logical_deduction-three_objects", hf_avail_splits=["train", "test", "validation"], @@ -3418,7 +3419,7 @@ bigbench_misconceptions_russian_helm = LightevalTaskConfig( name="bigbench:misconceptions_russian", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="misconceptions_russian", hf_avail_splits=["train", "test", "validation"], @@ -3436,7 +3437,7 @@ bigbench_novel_concepts_helm = LightevalTaskConfig( name="bigbench:novel_concepts", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="novel_concepts", hf_avail_splits=["train", "test", "validation"], @@ -3454,7 +3455,7 @@ bigbench_operators_helm = LightevalTaskConfig( name="bigbench:operators", suite=["helm", "bigbench_scenario"], - 
prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="operators", hf_avail_splits=["train", "test", "validation"], @@ -3472,7 +3473,7 @@ bigbench_parsinlu_reading_comprehension_helm = LightevalTaskConfig( name="bigbench:parsinlu_reading_comprehension", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="parsinlu_reading_comprehension", hf_avail_splits=["train", "test", "validation"], @@ -3490,7 +3491,7 @@ bigbench_play_dialog_same_or_different_helm = LightevalTaskConfig( name="bigbench:play_dialog_same_or_different", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="play_dialog_same_or_different", hf_avail_splits=["train", "test", "validation"], @@ -3508,7 +3509,7 @@ bigbench_repeat_copy_logic_helm = LightevalTaskConfig( name="bigbench:repeat_copy_logic", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="repeat_copy_logic", hf_avail_splits=["train", "test", "validation"], @@ -3526,7 +3527,7 @@ bigbench_strange_stories_boolean_helm = LightevalTaskConfig( name="bigbench:strange_stories-boolean", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="strange_stories-boolean", hf_avail_splits=["train", "test", "validation"], @@ -3544,7 +3545,7 @@ bigbench_strange_stories_multiple_choice_helm = LightevalTaskConfig( name="bigbench:strange_stories-multiple_choice", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="strange_stories-multiple_choice", hf_avail_splits=["train", "test", "validation"], @@ -3562,7 +3563,7 @@ bigbench_strategyqa_helm = LightevalTaskConfig( name="bigbench:strategyqa", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="strategyqa", hf_avail_splits=["train", "test", "validation"], @@ -3580,7 +3581,7 @@ bigbench_symbol_interpretation_adversarial_helm = LightevalTaskConfig( name="bigbench:symbol_interpretation-adversarial", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="symbol_interpretation-adversarial", hf_avail_splits=["train", "test", "validation"], @@ -3598,7 +3599,7 @@ bigbench_symbol_interpretation_emoji_agnostic_helm = LightevalTaskConfig( name="bigbench:symbol_interpretation-emoji_agnostic", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="symbol_interpretation-emoji_agnostic", hf_avail_splits=["train", "test", "validation"], @@ -3616,7 +3617,7 @@ bigbench_symbol_interpretation_name_agnostic_helm = LightevalTaskConfig( name="bigbench:symbol_interpretation-name_agnostic", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="symbol_interpretation-name_agnostic", hf_avail_splits=["train", "test", "validation"], @@ -3634,7 +3635,7 @@ 
bigbench_symbol_interpretation_plain_helm = LightevalTaskConfig( name="bigbench:symbol_interpretation-plain", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="symbol_interpretation-plain", hf_avail_splits=["train", "test", "validation"], @@ -3652,7 +3653,7 @@ bigbench_symbol_interpretation_tricky_helm = LightevalTaskConfig( name="bigbench:symbol_interpretation-tricky", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="symbol_interpretation-tricky", hf_avail_splits=["train", "test", "validation"], @@ -3670,7 +3671,7 @@ bigbench_vitaminc_fact_verification_helm = LightevalTaskConfig( name="bigbench:vitaminc_fact_verification", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="vitaminc_fact_verification", hf_avail_splits=["train", "test", "validation"], @@ -3688,7 +3689,7 @@ bigbench_winowhy_helm = LightevalTaskConfig( name="bigbench:winowhy", suite=["helm", "bigbench_scenario"], - prompt_function="bigbench_helm", + prompt_function=prompt.bigbench_helm, hf_repo="lighteval/bigbench_helm", hf_subset="winowhy", hf_avail_splits=["train", "test", "validation"], @@ -3706,7 +3707,7 @@ blimp_adjunct_island_lighteval = LightevalTaskConfig( name="blimp:adjunct_island", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="adjunct_island", hf_avail_splits=["train"], @@ -3724,7 +3725,7 @@ blimp_adjunct_island_helm = LightevalTaskConfig( name="blimp:adjunct_island", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="adjunct_island", hf_avail_splits=["train"], @@ -3742,7 +3743,7 @@ blimp_anaphor_gender_agreement_lighteval = LightevalTaskConfig( name="blimp:anaphor_gender_agreement", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="anaphor_gender_agreement", hf_avail_splits=["train"], @@ -3760,7 +3761,7 @@ blimp_anaphor_gender_agreement_helm = LightevalTaskConfig( name="blimp:anaphor_gender_agreement", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="anaphor_gender_agreement", hf_avail_splits=["train"], @@ -3778,7 +3779,7 @@ blimp_anaphor_number_agreement_lighteval = LightevalTaskConfig( name="blimp:anaphor_number_agreement", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="anaphor_number_agreement", hf_avail_splits=["train"], @@ -3796,7 +3797,7 @@ blimp_anaphor_number_agreement_helm = LightevalTaskConfig( name="blimp:anaphor_number_agreement", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="anaphor_number_agreement", hf_avail_splits=["train"], @@ -3814,7 +3815,7 @@ blimp_animate_subject_passive_lighteval = LightevalTaskConfig( name="blimp:animate_subject_passive", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="animate_subject_passive", hf_avail_splits=["train"], @@ -3832,7 +3833,7 @@ blimp_animate_subject_passive_helm = LightevalTaskConfig( name="blimp:animate_subject_passive", suite=["helm", "blimp"], - 
prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="animate_subject_passive", hf_avail_splits=["train"], @@ -3850,7 +3851,7 @@ blimp_animate_subject_trans_lighteval = LightevalTaskConfig( name="blimp:animate_subject_trans", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="animate_subject_trans", hf_avail_splits=["train"], @@ -3868,7 +3869,7 @@ blimp_animate_subject_trans_helm = LightevalTaskConfig( name="blimp:animate_subject_trans", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="animate_subject_trans", hf_avail_splits=["train"], @@ -3886,7 +3887,7 @@ blimp_causative_lighteval = LightevalTaskConfig( name="blimp:causative", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="causative", hf_avail_splits=["train"], @@ -3904,7 +3905,7 @@ blimp_causative_helm = LightevalTaskConfig( name="blimp:causative", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="causative", hf_avail_splits=["train"], @@ -3922,7 +3923,7 @@ blimp_complex_NP_island_lighteval = LightevalTaskConfig( name="blimp:complex_NP_island", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="complex_NP_island", hf_avail_splits=["train"], @@ -3940,7 +3941,7 @@ blimp_complex_NP_island_helm = LightevalTaskConfig( name="blimp:complex_NP_island", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="complex_NP_island", hf_avail_splits=["train"], @@ -3958,7 +3959,7 @@ blimp_coordinate_structure_constraint_complex_left_branch_lighteval = LightevalTaskConfig( name="blimp:coordinate_structure_constraint_complex_left_branch", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="coordinate_structure_constraint_complex_left_branch", hf_avail_splits=["train"], @@ -3976,7 +3977,7 @@ blimp_coordinate_structure_constraint_complex_left_branch_helm = LightevalTaskConfig( name="blimp:coordinate_structure_constraint_complex_left_branch", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="coordinate_structure_constraint_complex_left_branch", hf_avail_splits=["train"], @@ -3994,7 +3995,7 @@ blimp_coordinate_structure_constraint_object_extraction_lighteval = LightevalTaskConfig( name="blimp:coordinate_structure_constraint_object_extraction", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="coordinate_structure_constraint_object_extraction", hf_avail_splits=["train"], @@ -4012,7 +4013,7 @@ blimp_coordinate_structure_constraint_object_extraction_helm = LightevalTaskConfig( name="blimp:coordinate_structure_constraint_object_extraction", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="coordinate_structure_constraint_object_extraction", hf_avail_splits=["train"], @@ -4030,7 +4031,7 @@ blimp_determiner_noun_agreement_1_lighteval = LightevalTaskConfig( name="blimp:determiner_noun_agreement_1", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="determiner_noun_agreement_1", 
hf_avail_splits=["train"], @@ -4048,7 +4049,7 @@ blimp_determiner_noun_agreement_1_helm = LightevalTaskConfig( name="blimp:determiner_noun_agreement_1", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="determiner_noun_agreement_1", hf_avail_splits=["train"], @@ -4066,7 +4067,7 @@ blimp_determiner_noun_agreement_2_lighteval = LightevalTaskConfig( name="blimp:determiner_noun_agreement_2", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="determiner_noun_agreement_2", hf_avail_splits=["train"], @@ -4084,7 +4085,7 @@ blimp_determiner_noun_agreement_2_helm = LightevalTaskConfig( name="blimp:determiner_noun_agreement_2", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="determiner_noun_agreement_2", hf_avail_splits=["train"], @@ -4102,7 +4103,7 @@ blimp_determiner_noun_agreement_irregular_1_lighteval = LightevalTaskConfig( name="blimp:determiner_noun_agreement_irregular_1", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="determiner_noun_agreement_irregular_1", hf_avail_splits=["train"], @@ -4120,7 +4121,7 @@ blimp_determiner_noun_agreement_irregular_1_helm = LightevalTaskConfig( name="blimp:determiner_noun_agreement_irregular_1", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="determiner_noun_agreement_irregular_1", hf_avail_splits=["train"], @@ -4138,7 +4139,7 @@ blimp_determiner_noun_agreement_irregular_2_lighteval = LightevalTaskConfig( name="blimp:determiner_noun_agreement_irregular_2", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="determiner_noun_agreement_irregular_2", hf_avail_splits=["train"], @@ -4156,7 +4157,7 @@ blimp_determiner_noun_agreement_irregular_2_helm = LightevalTaskConfig( name="blimp:determiner_noun_agreement_irregular_2", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="determiner_noun_agreement_irregular_2", hf_avail_splits=["train"], @@ -4174,7 +4175,7 @@ blimp_determiner_noun_agreement_with_adj_2_lighteval = LightevalTaskConfig( name="blimp:determiner_noun_agreement_with_adj_2", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="determiner_noun_agreement_with_adj_2", hf_avail_splits=["train"], @@ -4192,7 +4193,7 @@ blimp_determiner_noun_agreement_with_adj_2_helm = LightevalTaskConfig( name="blimp:determiner_noun_agreement_with_adj_2", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="determiner_noun_agreement_with_adj_2", hf_avail_splits=["train"], @@ -4210,7 +4211,7 @@ blimp_determiner_noun_agreement_with_adj_irregular_1_lighteval = LightevalTaskConfig( name="blimp:determiner_noun_agreement_with_adj_irregular_1", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="determiner_noun_agreement_with_adj_irregular_1", hf_avail_splits=["train"], @@ -4228,7 +4229,7 @@ blimp_determiner_noun_agreement_with_adj_irregular_1_helm = LightevalTaskConfig( name="blimp:determiner_noun_agreement_with_adj_irregular_1", suite=["helm", "blimp"], - prompt_function="blimp_helm", + 
prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="determiner_noun_agreement_with_adj_irregular_1", hf_avail_splits=["train"], @@ -4246,7 +4247,7 @@ blimp_determiner_noun_agreement_with_adj_irregular_2_lighteval = LightevalTaskConfig( name="blimp:determiner_noun_agreement_with_adj_irregular_2", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="determiner_noun_agreement_with_adj_irregular_2", hf_avail_splits=["train"], @@ -4264,7 +4265,7 @@ blimp_determiner_noun_agreement_with_adj_irregular_2_helm = LightevalTaskConfig( name="blimp:determiner_noun_agreement_with_adj_irregular_2", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="determiner_noun_agreement_with_adj_irregular_2", hf_avail_splits=["train"], @@ -4282,7 +4283,7 @@ blimp_determiner_noun_agreement_with_adjective_1_lighteval = LightevalTaskConfig( name="blimp:determiner_noun_agreement_with_adjective_1", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="determiner_noun_agreement_with_adjective_1", hf_avail_splits=["train"], @@ -4300,7 +4301,7 @@ blimp_determiner_noun_agreement_with_adjective_1_helm = LightevalTaskConfig( name="blimp:determiner_noun_agreement_with_adjective_1", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="determiner_noun_agreement_with_adjective_1", hf_avail_splits=["train"], @@ -4318,7 +4319,7 @@ blimp_distractor_agreement_relational_noun_lighteval = LightevalTaskConfig( name="blimp:distractor_agreement_relational_noun", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="distractor_agreement_relational_noun", hf_avail_splits=["train"], @@ -4336,7 +4337,7 @@ blimp_distractor_agreement_relational_noun_helm = LightevalTaskConfig( name="blimp:distractor_agreement_relational_noun", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="distractor_agreement_relational_noun", hf_avail_splits=["train"], @@ -4354,7 +4355,7 @@ blimp_distractor_agreement_relative_clause_lighteval = LightevalTaskConfig( name="blimp:distractor_agreement_relative_clause", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="distractor_agreement_relative_clause", hf_avail_splits=["train"], @@ -4372,7 +4373,7 @@ blimp_distractor_agreement_relative_clause_helm = LightevalTaskConfig( name="blimp:distractor_agreement_relative_clause", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="distractor_agreement_relative_clause", hf_avail_splits=["train"], @@ -4390,7 +4391,7 @@ blimp_drop_argument_lighteval = LightevalTaskConfig( name="blimp:drop_argument", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="drop_argument", hf_avail_splits=["train"], @@ -4408,7 +4409,7 @@ blimp_drop_argument_helm = LightevalTaskConfig( name="blimp:drop_argument", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="drop_argument", hf_avail_splits=["train"], @@ -4426,7 +4427,7 @@ blimp_ellipsis_n_bar_1_lighteval = LightevalTaskConfig( name="blimp:ellipsis_n_bar_1", suite=["lighteval", 
"blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="ellipsis_n_bar_1", hf_avail_splits=["train"], @@ -4444,7 +4445,7 @@ blimp_ellipsis_n_bar_1_helm = LightevalTaskConfig( name="blimp:ellipsis_n_bar_1", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="ellipsis_n_bar_1", hf_avail_splits=["train"], @@ -4462,7 +4463,7 @@ blimp_ellipsis_n_bar_2_lighteval = LightevalTaskConfig( name="blimp:ellipsis_n_bar_2", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="ellipsis_n_bar_2", hf_avail_splits=["train"], @@ -4480,7 +4481,7 @@ blimp_ellipsis_n_bar_2_helm = LightevalTaskConfig( name="blimp:ellipsis_n_bar_2", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="ellipsis_n_bar_2", hf_avail_splits=["train"], @@ -4498,7 +4499,7 @@ blimp_existential_there_object_raising_lighteval = LightevalTaskConfig( name="blimp:existential_there_object_raising", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="existential_there_object_raising", hf_avail_splits=["train"], @@ -4516,7 +4517,7 @@ blimp_existential_there_object_raising_helm = LightevalTaskConfig( name="blimp:existential_there_object_raising", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="existential_there_object_raising", hf_avail_splits=["train"], @@ -4534,7 +4535,7 @@ blimp_existential_there_quantifiers_1_lighteval = LightevalTaskConfig( name="blimp:existential_there_quantifiers_1", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="existential_there_quantifiers_1", hf_avail_splits=["train"], @@ -4552,7 +4553,7 @@ blimp_existential_there_quantifiers_1_helm = LightevalTaskConfig( name="blimp:existential_there_quantifiers_1", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="existential_there_quantifiers_1", hf_avail_splits=["train"], @@ -4570,7 +4571,7 @@ blimp_existential_there_quantifiers_2_lighteval = LightevalTaskConfig( name="blimp:existential_there_quantifiers_2", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="existential_there_quantifiers_2", hf_avail_splits=["train"], @@ -4588,7 +4589,7 @@ blimp_existential_there_quantifiers_2_helm = LightevalTaskConfig( name="blimp:existential_there_quantifiers_2", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="existential_there_quantifiers_2", hf_avail_splits=["train"], @@ -4606,7 +4607,7 @@ blimp_existential_there_subject_raising_lighteval = LightevalTaskConfig( name="blimp:existential_there_subject_raising", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="existential_there_subject_raising", hf_avail_splits=["train"], @@ -4624,7 +4625,7 @@ blimp_existential_there_subject_raising_helm = LightevalTaskConfig( name="blimp:existential_there_subject_raising", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="existential_there_subject_raising", hf_avail_splits=["train"], @@ -4642,7 +4643,7 @@ 
blimp_expletive_it_object_raising_lighteval = LightevalTaskConfig( name="blimp:expletive_it_object_raising", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="expletive_it_object_raising", hf_avail_splits=["train"], @@ -4660,7 +4661,7 @@ blimp_expletive_it_object_raising_helm = LightevalTaskConfig( name="blimp:expletive_it_object_raising", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="expletive_it_object_raising", hf_avail_splits=["train"], @@ -4678,7 +4679,7 @@ blimp_inchoative_lighteval = LightevalTaskConfig( name="blimp:inchoative", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="inchoative", hf_avail_splits=["train"], @@ -4696,7 +4697,7 @@ blimp_inchoative_helm = LightevalTaskConfig( name="blimp:inchoative", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="inchoative", hf_avail_splits=["train"], @@ -4714,7 +4715,7 @@ blimp_intransitive_lighteval = LightevalTaskConfig( name="blimp:intransitive", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="intransitive", hf_avail_splits=["train"], @@ -4732,7 +4733,7 @@ blimp_intransitive_helm = LightevalTaskConfig( name="blimp:intransitive", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="intransitive", hf_avail_splits=["train"], @@ -4750,7 +4751,7 @@ blimp_irregular_past_participle_adjectives_lighteval = LightevalTaskConfig( name="blimp:irregular_past_participle_adjectives", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="irregular_past_participle_adjectives", hf_avail_splits=["train"], @@ -4768,7 +4769,7 @@ blimp_irregular_past_participle_adjectives_helm = LightevalTaskConfig( name="blimp:irregular_past_participle_adjectives", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="irregular_past_participle_adjectives", hf_avail_splits=["train"], @@ -4786,7 +4787,7 @@ blimp_irregular_past_participle_verbs_lighteval = LightevalTaskConfig( name="blimp:irregular_past_participle_verbs", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="irregular_past_participle_verbs", hf_avail_splits=["train"], @@ -4804,7 +4805,7 @@ blimp_irregular_past_participle_verbs_helm = LightevalTaskConfig( name="blimp:irregular_past_participle_verbs", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="irregular_past_participle_verbs", hf_avail_splits=["train"], @@ -4822,7 +4823,7 @@ blimp_irregular_plural_subject_verb_agreement_1_lighteval = LightevalTaskConfig( name="blimp:irregular_plural_subject_verb_agreement_1", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="irregular_plural_subject_verb_agreement_1", hf_avail_splits=["train"], @@ -4840,7 +4841,7 @@ blimp_irregular_plural_subject_verb_agreement_1_helm = LightevalTaskConfig( name="blimp:irregular_plural_subject_verb_agreement_1", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", 
hf_subset="irregular_plural_subject_verb_agreement_1", hf_avail_splits=["train"], @@ -4858,7 +4859,7 @@ blimp_irregular_plural_subject_verb_agreement_2_lighteval = LightevalTaskConfig( name="blimp:irregular_plural_subject_verb_agreement_2", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="irregular_plural_subject_verb_agreement_2", hf_avail_splits=["train"], @@ -4876,7 +4877,7 @@ blimp_irregular_plural_subject_verb_agreement_2_helm = LightevalTaskConfig( name="blimp:irregular_plural_subject_verb_agreement_2", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="irregular_plural_subject_verb_agreement_2", hf_avail_splits=["train"], @@ -4894,7 +4895,7 @@ blimp_left_branch_island_echo_question_lighteval = LightevalTaskConfig( name="blimp:left_branch_island_echo_question", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="left_branch_island_echo_question", hf_avail_splits=["train"], @@ -4912,7 +4913,7 @@ blimp_left_branch_island_echo_question_helm = LightevalTaskConfig( name="blimp:left_branch_island_echo_question", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="left_branch_island_echo_question", hf_avail_splits=["train"], @@ -4930,7 +4931,7 @@ blimp_left_branch_island_simple_question_lighteval = LightevalTaskConfig( name="blimp:left_branch_island_simple_question", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="left_branch_island_simple_question", hf_avail_splits=["train"], @@ -4948,7 +4949,7 @@ blimp_left_branch_island_simple_question_helm = LightevalTaskConfig( name="blimp:left_branch_island_simple_question", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="left_branch_island_simple_question", hf_avail_splits=["train"], @@ -4966,7 +4967,7 @@ blimp_matrix_question_npi_licensor_present_lighteval = LightevalTaskConfig( name="blimp:matrix_question_npi_licensor_present", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="matrix_question_npi_licensor_present", hf_avail_splits=["train"], @@ -4984,7 +4985,7 @@ blimp_matrix_question_npi_licensor_present_helm = LightevalTaskConfig( name="blimp:matrix_question_npi_licensor_present", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="matrix_question_npi_licensor_present", hf_avail_splits=["train"], @@ -5002,7 +5003,7 @@ blimp_npi_present_1_lighteval = LightevalTaskConfig( name="blimp:npi_present_1", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="npi_present_1", hf_avail_splits=["train"], @@ -5020,7 +5021,7 @@ blimp_npi_present_1_helm = LightevalTaskConfig( name="blimp:npi_present_1", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="npi_present_1", hf_avail_splits=["train"], @@ -5038,7 +5039,7 @@ blimp_npi_present_2_lighteval = LightevalTaskConfig( name="blimp:npi_present_2", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="npi_present_2", hf_avail_splits=["train"], @@ -5056,7 +5057,7 @@ 
blimp_npi_present_2_helm = LightevalTaskConfig( name="blimp:npi_present_2", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="npi_present_2", hf_avail_splits=["train"], @@ -5074,7 +5075,7 @@ blimp_only_npi_licensor_present_lighteval = LightevalTaskConfig( name="blimp:only_npi_licensor_present", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="only_npi_licensor_present", hf_avail_splits=["train"], @@ -5092,7 +5093,7 @@ blimp_only_npi_licensor_present_helm = LightevalTaskConfig( name="blimp:only_npi_licensor_present", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="only_npi_licensor_present", hf_avail_splits=["train"], @@ -5110,7 +5111,7 @@ blimp_only_npi_scope_lighteval = LightevalTaskConfig( name="blimp:only_npi_scope", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="only_npi_scope", hf_avail_splits=["train"], @@ -5128,7 +5129,7 @@ blimp_only_npi_scope_helm = LightevalTaskConfig( name="blimp:only_npi_scope", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="only_npi_scope", hf_avail_splits=["train"], @@ -5146,7 +5147,7 @@ blimp_passive_1_lighteval = LightevalTaskConfig( name="blimp:passive_1", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="passive_1", hf_avail_splits=["train"], @@ -5164,7 +5165,7 @@ blimp_passive_1_helm = LightevalTaskConfig( name="blimp:passive_1", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="passive_1", hf_avail_splits=["train"], @@ -5182,7 +5183,7 @@ blimp_passive_2_lighteval = LightevalTaskConfig( name="blimp:passive_2", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="passive_2", hf_avail_splits=["train"], @@ -5200,7 +5201,7 @@ blimp_passive_2_helm = LightevalTaskConfig( name="blimp:passive_2", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="passive_2", hf_avail_splits=["train"], @@ -5218,7 +5219,7 @@ blimp_principle_A_c_command_lighteval = LightevalTaskConfig( name="blimp:principle_A_c_command", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="principle_A_c_command", hf_avail_splits=["train"], @@ -5236,7 +5237,7 @@ blimp_principle_A_c_command_helm = LightevalTaskConfig( name="blimp:principle_A_c_command", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="principle_A_c_command", hf_avail_splits=["train"], @@ -5254,7 +5255,7 @@ blimp_principle_A_case_1_lighteval = LightevalTaskConfig( name="blimp:principle_A_case_1", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="principle_A_case_1", hf_avail_splits=["train"], @@ -5272,7 +5273,7 @@ blimp_principle_A_case_1_helm = LightevalTaskConfig( name="blimp:principle_A_case_1", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="principle_A_case_1", hf_avail_splits=["train"], @@ -5290,7 +5291,7 @@ 
blimp_principle_A_case_2_lighteval = LightevalTaskConfig( name="blimp:principle_A_case_2", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="principle_A_case_2", hf_avail_splits=["train"], @@ -5308,7 +5309,7 @@ blimp_principle_A_case_2_helm = LightevalTaskConfig( name="blimp:principle_A_case_2", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="principle_A_case_2", hf_avail_splits=["train"], @@ -5326,7 +5327,7 @@ blimp_principle_A_domain_1_lighteval = LightevalTaskConfig( name="blimp:principle_A_domain_1", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="principle_A_domain_1", hf_avail_splits=["train"], @@ -5344,7 +5345,7 @@ blimp_principle_A_domain_1_helm = LightevalTaskConfig( name="blimp:principle_A_domain_1", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="principle_A_domain_1", hf_avail_splits=["train"], @@ -5362,7 +5363,7 @@ blimp_principle_A_domain_2_lighteval = LightevalTaskConfig( name="blimp:principle_A_domain_2", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="principle_A_domain_2", hf_avail_splits=["train"], @@ -5380,7 +5381,7 @@ blimp_principle_A_domain_2_helm = LightevalTaskConfig( name="blimp:principle_A_domain_2", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="principle_A_domain_2", hf_avail_splits=["train"], @@ -5398,7 +5399,7 @@ blimp_principle_A_domain_3_lighteval = LightevalTaskConfig( name="blimp:principle_A_domain_3", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="principle_A_domain_3", hf_avail_splits=["train"], @@ -5416,7 +5417,7 @@ blimp_principle_A_domain_3_helm = LightevalTaskConfig( name="blimp:principle_A_domain_3", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="principle_A_domain_3", hf_avail_splits=["train"], @@ -5434,7 +5435,7 @@ blimp_principle_A_reconstruction_lighteval = LightevalTaskConfig( name="blimp:principle_A_reconstruction", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="principle_A_reconstruction", hf_avail_splits=["train"], @@ -5452,7 +5453,7 @@ blimp_principle_A_reconstruction_helm = LightevalTaskConfig( name="blimp:principle_A_reconstruction", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="principle_A_reconstruction", hf_avail_splits=["train"], @@ -5470,7 +5471,7 @@ blimp_regular_plural_subject_verb_agreement_1_lighteval = LightevalTaskConfig( name="blimp:regular_plural_subject_verb_agreement_1", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="regular_plural_subject_verb_agreement_1", hf_avail_splits=["train"], @@ -5488,7 +5489,7 @@ blimp_regular_plural_subject_verb_agreement_1_helm = LightevalTaskConfig( name="blimp:regular_plural_subject_verb_agreement_1", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="regular_plural_subject_verb_agreement_1", hf_avail_splits=["train"], @@ -5506,7 +5507,7 @@ 
blimp_regular_plural_subject_verb_agreement_2_lighteval = LightevalTaskConfig( name="blimp:regular_plural_subject_verb_agreement_2", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="regular_plural_subject_verb_agreement_2", hf_avail_splits=["train"], @@ -5524,7 +5525,7 @@ blimp_regular_plural_subject_verb_agreement_2_helm = LightevalTaskConfig( name="blimp:regular_plural_subject_verb_agreement_2", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="regular_plural_subject_verb_agreement_2", hf_avail_splits=["train"], @@ -5542,7 +5543,7 @@ blimp_sentential_negation_npi_licensor_present_lighteval = LightevalTaskConfig( name="blimp:sentential_negation_npi_licensor_present", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="sentential_negation_npi_licensor_present", hf_avail_splits=["train"], @@ -5560,7 +5561,7 @@ blimp_sentential_negation_npi_licensor_present_helm = LightevalTaskConfig( name="blimp:sentential_negation_npi_licensor_present", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="sentential_negation_npi_licensor_present", hf_avail_splits=["train"], @@ -5578,7 +5579,7 @@ blimp_sentential_negation_npi_scope_lighteval = LightevalTaskConfig( name="blimp:sentential_negation_npi_scope", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="sentential_negation_npi_scope", hf_avail_splits=["train"], @@ -5596,7 +5597,7 @@ blimp_sentential_negation_npi_scope_helm = LightevalTaskConfig( name="blimp:sentential_negation_npi_scope", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="sentential_negation_npi_scope", hf_avail_splits=["train"], @@ -5614,7 +5615,7 @@ blimp_sentential_subject_island_lighteval = LightevalTaskConfig( name="blimp:sentential_subject_island", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="sentential_subject_island", hf_avail_splits=["train"], @@ -5632,7 +5633,7 @@ blimp_sentential_subject_island_helm = LightevalTaskConfig( name="blimp:sentential_subject_island", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="sentential_subject_island", hf_avail_splits=["train"], @@ -5650,7 +5651,7 @@ blimp_superlative_quantifiers_1_lighteval = LightevalTaskConfig( name="blimp:superlative_quantifiers_1", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="superlative_quantifiers_1", hf_avail_splits=["train"], @@ -5668,7 +5669,7 @@ blimp_superlative_quantifiers_1_helm = LightevalTaskConfig( name="blimp:superlative_quantifiers_1", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="superlative_quantifiers_1", hf_avail_splits=["train"], @@ -5686,7 +5687,7 @@ blimp_superlative_quantifiers_2_lighteval = LightevalTaskConfig( name="blimp:superlative_quantifiers_2", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="superlative_quantifiers_2", hf_avail_splits=["train"], @@ -5704,7 +5705,7 @@ blimp_superlative_quantifiers_2_helm = LightevalTaskConfig( 
name="blimp:superlative_quantifiers_2", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="superlative_quantifiers_2", hf_avail_splits=["train"], @@ -5722,7 +5723,7 @@ blimp_tough_vs_raising_1_lighteval = LightevalTaskConfig( name="blimp:tough_vs_raising_1", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="tough_vs_raising_1", hf_avail_splits=["train"], @@ -5740,7 +5741,7 @@ blimp_tough_vs_raising_1_helm = LightevalTaskConfig( name="blimp:tough_vs_raising_1", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="tough_vs_raising_1", hf_avail_splits=["train"], @@ -5758,7 +5759,7 @@ blimp_tough_vs_raising_2_lighteval = LightevalTaskConfig( name="blimp:tough_vs_raising_2", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="tough_vs_raising_2", hf_avail_splits=["train"], @@ -5776,7 +5777,7 @@ blimp_tough_vs_raising_2_helm = LightevalTaskConfig( name="blimp:tough_vs_raising_2", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="tough_vs_raising_2", hf_avail_splits=["train"], @@ -5794,7 +5795,7 @@ blimp_transitive_lighteval = LightevalTaskConfig( name="blimp:transitive", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="transitive", hf_avail_splits=["train"], @@ -5812,7 +5813,7 @@ blimp_transitive_helm = LightevalTaskConfig( name="blimp:transitive", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="transitive", hf_avail_splits=["train"], @@ -5830,7 +5831,7 @@ blimp_wh_island_lighteval = LightevalTaskConfig( name="blimp:wh_island", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="wh_island", hf_avail_splits=["train"], @@ -5848,7 +5849,7 @@ blimp_wh_island_helm = LightevalTaskConfig( name="blimp:wh_island", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="wh_island", hf_avail_splits=["train"], @@ -5866,7 +5867,7 @@ blimp_wh_questions_object_gap_lighteval = LightevalTaskConfig( name="blimp:wh_questions_object_gap", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="wh_questions_object_gap", hf_avail_splits=["train"], @@ -5884,7 +5885,7 @@ blimp_wh_questions_object_gap_helm = LightevalTaskConfig( name="blimp:wh_questions_object_gap", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="wh_questions_object_gap", hf_avail_splits=["train"], @@ -5902,7 +5903,7 @@ blimp_wh_questions_subject_gap_lighteval = LightevalTaskConfig( name="blimp:wh_questions_subject_gap", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="wh_questions_subject_gap", hf_avail_splits=["train"], @@ -5920,7 +5921,7 @@ blimp_wh_questions_subject_gap_helm = LightevalTaskConfig( name="blimp:wh_questions_subject_gap", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="wh_questions_subject_gap", hf_avail_splits=["train"], @@ -5938,7 +5939,7 @@ 
blimp_wh_questions_subject_gap_long_distance_lighteval = LightevalTaskConfig( name="blimp:wh_questions_subject_gap_long_distance", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="wh_questions_subject_gap_long_distance", hf_avail_splits=["train"], @@ -5956,7 +5957,7 @@ blimp_wh_questions_subject_gap_long_distance_helm = LightevalTaskConfig( name="blimp:wh_questions_subject_gap_long_distance", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="wh_questions_subject_gap_long_distance", hf_avail_splits=["train"], @@ -5974,7 +5975,7 @@ blimp_wh_vs_that_no_gap_lighteval = LightevalTaskConfig( name="blimp:wh_vs_that_no_gap", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="wh_vs_that_no_gap", hf_avail_splits=["train"], @@ -5992,7 +5993,7 @@ blimp_wh_vs_that_no_gap_helm = LightevalTaskConfig( name="blimp:wh_vs_that_no_gap", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="wh_vs_that_no_gap", hf_avail_splits=["train"], @@ -6010,7 +6011,7 @@ blimp_wh_vs_that_no_gap_long_distance_lighteval = LightevalTaskConfig( name="blimp:wh_vs_that_no_gap_long_distance", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="wh_vs_that_no_gap_long_distance", hf_avail_splits=["train"], @@ -6028,7 +6029,7 @@ blimp_wh_vs_that_no_gap_long_distance_helm = LightevalTaskConfig( name="blimp:wh_vs_that_no_gap_long_distance", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="wh_vs_that_no_gap_long_distance", hf_avail_splits=["train"], @@ -6046,7 +6047,7 @@ blimp_wh_vs_that_with_gap_lighteval = LightevalTaskConfig( name="blimp:wh_vs_that_with_gap", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="wh_vs_that_with_gap", hf_avail_splits=["train"], @@ -6064,7 +6065,7 @@ blimp_wh_vs_that_with_gap_helm = LightevalTaskConfig( name="blimp:wh_vs_that_with_gap", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="wh_vs_that_with_gap", hf_avail_splits=["train"], @@ -6082,7 +6083,7 @@ blimp_wh_vs_that_with_gap_long_distance_lighteval = LightevalTaskConfig( name="blimp:wh_vs_that_with_gap_long_distance", suite=["lighteval", "blimp"], - prompt_function="blimp", + prompt_function=prompt.blimp, hf_repo="blimp", hf_subset="wh_vs_that_with_gap_long_distance", hf_avail_splits=["train"], @@ -6100,7 +6101,7 @@ blimp_wh_vs_that_with_gap_long_distance_helm = LightevalTaskConfig( name="blimp:wh_vs_that_with_gap_long_distance", suite=["helm", "blimp"], - prompt_function="blimp_helm", + prompt_function=prompt.blimp_helm, hf_repo="blimp", hf_subset="wh_vs_that_with_gap_long_distance", hf_avail_splits=["train"], @@ -6118,7 +6119,7 @@ bold_helm = LightevalTaskConfig( name="bold", suite=["helm"], - prompt_function="bold", + prompt_function=prompt.bold, hf_repo="lighteval/bold_helm", hf_subset="all", hf_avail_splits=["train", "test"], @@ -6136,7 +6137,7 @@ bold_gender_helm = LightevalTaskConfig( name="bold:gender", suite=["helm"], - prompt_function="bold", + prompt_function=prompt.bold, hf_repo="lighteval/bold_helm", hf_subset="gender", hf_avail_splits=["train", "test"], @@ -6154,7 +6155,7 @@ 
bold_political_ideology_helm = LightevalTaskConfig( name="bold:political_ideology", suite=["helm"], - prompt_function="bold", + prompt_function=prompt.bold, hf_repo="lighteval/bold_helm", hf_subset="political_ideology", hf_avail_splits=["train", "test"], @@ -6172,7 +6173,7 @@ bold_profession_helm = LightevalTaskConfig( name="bold:profession", suite=["helm"], - prompt_function="bold", + prompt_function=prompt.bold, hf_repo="lighteval/bold_helm", hf_subset="profession", hf_avail_splits=["train", "test"], @@ -6190,7 +6191,7 @@ bold_race_helm = LightevalTaskConfig( name="bold:race", suite=["helm"], - prompt_function="bold", + prompt_function=prompt.bold, hf_repo="lighteval/bold_helm", hf_subset="race", hf_avail_splits=["train", "test"], @@ -6208,7 +6209,7 @@ bold_religious_ideology_helm = LightevalTaskConfig( name="bold:religious_ideology", suite=["helm"], - prompt_function="bold", + prompt_function=prompt.bold, hf_repo="lighteval/bold_helm", hf_subset="religious_ideology", hf_avail_splits=["train", "test"], @@ -6226,7 +6227,7 @@ boolq_helm = LightevalTaskConfig( name="boolq", suite=["helm", "helm_general"], - prompt_function="boolq_helm", + prompt_function=prompt.boolq_helm, hf_repo="lighteval/boolq_helm", hf_subset="default", hf_avail_splits=["train", "validation"], @@ -6244,7 +6245,7 @@ boolq_contrastset_helm = LightevalTaskConfig( name="boolq:contrastset", suite=["helm"], - prompt_function="boolq_helm_contrastset", + prompt_function=prompt.boolq_helm_contrastset, hf_repo="lighteval/boolq_helm", hf_subset="default", hf_avail_splits=["validation"], @@ -6262,7 +6263,7 @@ bridging_anaphora_resolution_barqa_bigbench = LightevalTaskConfig( name="bridging_anaphora_resolution_barqa", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="bridging_anaphora_resolution_barqa", hf_avail_splits=["default", "train", "validation"], @@ -6280,7 +6281,7 @@ causal_judgment_bigbench = LightevalTaskConfig( name="causal_judgment", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="causal_judgment", hf_avail_splits=["default", "train", "validation"], @@ -6298,7 +6299,7 @@ cause_and_effect_bigbench = LightevalTaskConfig( name="cause_and_effect", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="cause_and_effect", hf_avail_splits=["default", "train", "validation"], @@ -6316,7 +6317,7 @@ checkmate_in_one_bigbench = LightevalTaskConfig( name="checkmate_in_one", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="checkmate_in_one", hf_avail_splits=["default", "train", "validation"], @@ -6334,7 +6335,7 @@ chess_state_tracking_bigbench = LightevalTaskConfig( name="chess_state_tracking", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="chess_state_tracking", hf_avail_splits=["default", "train", "validation"], @@ -6352,7 +6353,7 @@ chinese_remainder_theorem_bigbench = LightevalTaskConfig( name="chinese_remainder_theorem", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="chinese_remainder_theorem", hf_avail_splits=["default", "train", "validation"], @@ -6370,7 +6371,7 @@ cifar10_classification_bigbench = LightevalTaskConfig( 
name="cifar10_classification", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="cifar10_classification", hf_avail_splits=["default", "train", "validation"], @@ -6388,7 +6389,7 @@ civil_comments_helm = LightevalTaskConfig( name="civil_comments", suite=["helm", "helm_general"], - prompt_function="civil_comments", + prompt_function=prompt.civil_comments, hf_repo="lighteval/civil_comments_helm", hf_subset="all", hf_avail_splits=["train", "test"], @@ -6413,7 +6414,7 @@ civil_comments_LGBTQ_helm = LightevalTaskConfig( name="civil_comments:LGBTQ", suite=["helm"], - prompt_function="civil_comments", + prompt_function=prompt.civil_comments, hf_repo="lighteval/civil_comments_helm", hf_subset="LGBTQ", hf_avail_splits=["train", "test"], @@ -6438,7 +6439,7 @@ civil_comments_black_helm = LightevalTaskConfig( name="civil_comments:black", suite=["helm"], - prompt_function="civil_comments", + prompt_function=prompt.civil_comments, hf_repo="lighteval/civil_comments_helm", hf_subset="black", hf_avail_splits=["train", "test"], @@ -6463,7 +6464,7 @@ civil_comments_christian_helm = LightevalTaskConfig( name="civil_comments:christian", suite=["helm"], - prompt_function="civil_comments", + prompt_function=prompt.civil_comments, hf_repo="lighteval/civil_comments_helm", hf_subset="christian", hf_avail_splits=["train", "test"], @@ -6488,7 +6489,7 @@ civil_comments_female_helm = LightevalTaskConfig( name="civil_comments:female", suite=["helm"], - prompt_function="civil_comments", + prompt_function=prompt.civil_comments, hf_repo="lighteval/civil_comments_helm", hf_subset="female", hf_avail_splits=["train", "test"], @@ -6513,7 +6514,7 @@ civil_comments_male_helm = LightevalTaskConfig( name="civil_comments:male", suite=["helm"], - prompt_function="civil_comments", + prompt_function=prompt.civil_comments, hf_repo="lighteval/civil_comments_helm", hf_subset="male", hf_avail_splits=["train", "test"], @@ -6538,7 +6539,7 @@ civil_comments_muslim_helm = LightevalTaskConfig( name="civil_comments:muslim", suite=["helm"], - prompt_function="civil_comments", + prompt_function=prompt.civil_comments, hf_repo="lighteval/civil_comments_helm", hf_subset="muslim", hf_avail_splits=["train", "test"], @@ -6563,7 +6564,7 @@ civil_comments_other_religions_helm = LightevalTaskConfig( name="civil_comments:other_religions", suite=["helm"], - prompt_function="civil_comments", + prompt_function=prompt.civil_comments, hf_repo="lighteval/civil_comments_helm", hf_subset="other_religions", hf_avail_splits=["train", "test"], @@ -6588,7 +6589,7 @@ civil_comments_white_helm = LightevalTaskConfig( name="civil_comments:white", suite=["helm"], - prompt_function="civil_comments", + prompt_function=prompt.civil_comments, hf_repo="lighteval/civil_comments_helm", hf_subset="white", hf_avail_splits=["train", "test"], @@ -6613,7 +6614,7 @@ code_line_description_bigbench_lite = LightevalTaskConfig( name="code_line_description", suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function="bigbench_linefeed_before_and_after_query", + prompt_function=prompt.bigbench_linefeed_before_and_after_query, hf_repo="bigbench", hf_subset="code_line_description", hf_avail_splits=["default", "train", "validation"], @@ -6631,7 +6632,7 @@ codenames_bigbench = LightevalTaskConfig( name="codenames", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="codenames", hf_avail_splits=["default", "train", 
"validation"], @@ -6649,7 +6650,7 @@ color_bigbench = LightevalTaskConfig( name="color", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="color", hf_avail_splits=["default", "train", "validation"], @@ -6667,7 +6668,7 @@ common_morpheme_bigbench = LightevalTaskConfig( name="common_morpheme", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="common_morpheme", hf_avail_splits=["default", "train", "validation"], @@ -6685,7 +6686,7 @@ commonsenseqa_helm = LightevalTaskConfig( name="commonsenseqa", suite=["helm", "commonsense_scenario"], - prompt_function="commonsense_qa", + prompt_function=prompt.commonsense_qa, hf_repo="commonsense_qa", hf_subset="default", hf_avail_splits=["train", "test", "validation"], @@ -6703,7 +6704,7 @@ conceptual_combinations_bigbench_lite = LightevalTaskConfig( name="conceptual_combinations", suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function="bigbench_linefeed_before_whitespace_after_query", + prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, hf_repo="bigbench", hf_subset="conceptual_combinations", hf_avail_splits=["default", "train", "validation"], @@ -6721,7 +6722,7 @@ conlang_translation_bigbench_lite = LightevalTaskConfig( name="conlang_translation", suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function="bigbench_whitespace_after_query", + prompt_function=prompt.bigbench_whitespace_after_query, hf_repo="bigbench", hf_subset="conlang_translation", hf_avail_splits=["default", "train", "validation"], @@ -6738,7 +6739,7 @@ contextual_parametric_knowledge_conflicts_bigbench = LightevalTaskConfig( name="contextual_parametric_knowledge_conflicts", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="contextual_parametric_knowledge_conflicts", hf_avail_splits=["default", "train", "validation"], @@ -6756,7 +6757,7 @@ copyright_n_books_1000_extractions_per_book_1_prefix_length_125_helm = LightevalTaskConfig( name="copyright:n_books_1000-extractions_per_book_1-prefix_length_125", suite=["helm", "copyright_scenario"], - prompt_function="copyright", + prompt_function=prompt.copyright, hf_repo="lighteval/copyright_helm", hf_subset="n_books_1000-extractions_per_book_1-prefix_length_125", hf_avail_splits=["train"], @@ -6774,7 +6775,7 @@ copyright_n_books_1000_extractions_per_book_1_prefix_length_25_helm = LightevalTaskConfig( name="copyright:n_books_1000-extractions_per_book_1-prefix_length_25", suite=["helm", "copyright_scenario"], - prompt_function="copyright", + prompt_function=prompt.copyright, hf_repo="lighteval/copyright_helm", hf_subset="n_books_1000-extractions_per_book_1-prefix_length_25", hf_avail_splits=["train"], @@ -6792,7 +6793,7 @@ copyright_n_books_1000_extractions_per_book_1_prefix_length_5_helm = LightevalTaskConfig( name="copyright:n_books_1000-extractions_per_book_1-prefix_length_5", suite=["helm", "copyright_scenario"], - prompt_function="copyright", + prompt_function=prompt.copyright, hf_repo="lighteval/copyright_helm", hf_subset="n_books_1000-extractions_per_book_1-prefix_length_5", hf_avail_splits=["train"], @@ -6810,7 +6811,7 @@ copyright_n_books_1000_extractions_per_book_3_prefix_length_125_helm = LightevalTaskConfig( name="copyright:n_books_1000-extractions_per_book_3-prefix_length_125", suite=["helm", "copyright_scenario"], - 
prompt_function="copyright", + prompt_function=prompt.copyright, hf_repo="lighteval/copyright_helm", hf_subset="n_books_1000-extractions_per_book_3-prefix_length_125", hf_avail_splits=["train"], @@ -6828,7 +6829,7 @@ copyright_n_books_1000_extractions_per_book_3_prefix_length_25_helm = LightevalTaskConfig( name="copyright:n_books_1000-extractions_per_book_3-prefix_length_25", suite=["helm", "copyright_scenario"], - prompt_function="copyright", + prompt_function=prompt.copyright, hf_repo="lighteval/copyright_helm", hf_subset="n_books_1000-extractions_per_book_3-prefix_length_25", hf_avail_splits=["train"], @@ -6846,7 +6847,7 @@ copyright_n_books_1000_extractions_per_book_3_prefix_length_5_helm = LightevalTaskConfig( name="copyright:n_books_1000-extractions_per_book_3-prefix_length_5", suite=["helm", "copyright_scenario"], - prompt_function="copyright", + prompt_function=prompt.copyright, hf_repo="lighteval/copyright_helm", hf_subset="n_books_1000-extractions_per_book_3-prefix_length_5", hf_avail_splits=["train"], @@ -6864,7 +6865,7 @@ copyright_oh_the_places_helm = LightevalTaskConfig( name="copyright:oh_the_places", suite=["helm", "copyright_scenario"], - prompt_function="copyright", + prompt_function=prompt.copyright, hf_repo="lighteval/copyright_helm", hf_subset="oh_the_places", hf_avail_splits=["train"], @@ -6882,7 +6883,7 @@ copyright_pilot_helm = LightevalTaskConfig( name="copyright:pilot", suite=["helm", "copyright_scenario"], - prompt_function="copyright", + prompt_function=prompt.copyright, hf_repo="lighteval/copyright_helm", hf_subset="pilot", hf_avail_splits=["train"], @@ -6900,7 +6901,7 @@ copyright_popular_books_prefix_length_10_helm = LightevalTaskConfig( name="copyright:popular_books-prefix_length_10", suite=["helm", "copyright_scenario"], - prompt_function="copyright", + prompt_function=prompt.copyright, hf_repo="lighteval/copyright_helm", hf_subset="popular_books-prefix_length_10", hf_avail_splits=["train"], @@ -6918,7 +6919,7 @@ copyright_popular_books_prefix_length_125_helm = LightevalTaskConfig( name="copyright:popular_books-prefix_length_125", suite=["helm", "copyright_scenario"], - prompt_function="copyright", + prompt_function=prompt.copyright, hf_repo="lighteval/copyright_helm", hf_subset="popular_books-prefix_length_125", hf_avail_splits=["train"], @@ -6936,7 +6937,7 @@ copyright_popular_books_prefix_length_25_helm = LightevalTaskConfig( name="copyright:popular_books-prefix_length_25", suite=["helm", "copyright_scenario"], - prompt_function="copyright", + prompt_function=prompt.copyright, hf_repo="lighteval/copyright_helm", hf_subset="popular_books-prefix_length_25", hf_avail_splits=["train"], @@ -6954,7 +6955,7 @@ copyright_popular_books_prefix_length_250_helm = LightevalTaskConfig( name="copyright:popular_books-prefix_length_250", suite=["helm", "copyright_scenario"], - prompt_function="copyright", + prompt_function=prompt.copyright, hf_repo="lighteval/copyright_helm", hf_subset="popular_books-prefix_length_250", hf_avail_splits=["train"], @@ -6972,7 +6973,7 @@ copyright_popular_books_prefix_length_5_helm = LightevalTaskConfig( name="copyright:popular_books-prefix_length_5", suite=["helm", "copyright_scenario"], - prompt_function="copyright", + prompt_function=prompt.copyright, hf_repo="lighteval/copyright_helm", hf_subset="popular_books-prefix_length_5", hf_avail_splits=["train"], @@ -6990,7 +6991,7 @@ copyright_popular_books_prefix_length_50_helm = LightevalTaskConfig( name="copyright:popular_books-prefix_length_50", suite=["helm", "copyright_scenario"], - 
prompt_function="copyright", + prompt_function=prompt.copyright, hf_repo="lighteval/copyright_helm", hf_subset="popular_books-prefix_length_50", hf_avail_splits=["train"], @@ -7008,7 +7009,7 @@ copyright_prompt_num_line_1_min_lines_20_helm = LightevalTaskConfig( name="copyright:prompt_num_line_1-min_lines_20", suite=["helm", "copyright_scenario"], - prompt_function="copyright", + prompt_function=prompt.copyright, hf_repo="lighteval/copyright_helm", hf_subset="prompt_num_line_1-min_lines_20", hf_avail_splits=["train"], @@ -7026,7 +7027,7 @@ copyright_prompt_num_line_10_min_lines_20_helm = LightevalTaskConfig( name="copyright:prompt_num_line_10-min_lines_20", suite=["helm", "copyright_scenario"], - prompt_function="copyright", + prompt_function=prompt.copyright, hf_repo="lighteval/copyright_helm", hf_subset="prompt_num_line_10-min_lines_20", hf_avail_splits=["train"], @@ -7044,7 +7045,7 @@ copyright_prompt_num_line_5_min_lines_20_helm = LightevalTaskConfig( name="copyright:prompt_num_line_5-min_lines_20", suite=["helm", "copyright_scenario"], - prompt_function="copyright", + prompt_function=prompt.copyright, hf_repo="lighteval/copyright_helm", hf_subset="prompt_num_line_5-min_lines_20", hf_avail_splits=["train"], @@ -7062,7 +7063,7 @@ coqa_lighteval = LightevalTaskConfig( name="coqa", suite=["lighteval"], - prompt_function="coqa", + prompt_function=prompt.coqa, hf_repo="coqa", hf_subset="default", hf_avail_splits=["train", "validation"], @@ -7080,7 +7081,7 @@ coqa_bb_lighteval = LightevalTaskConfig( name="coqa_bb", suite=["lighteval", "bigbench_programmatic", "bigbench"], - prompt_function="coqa", + prompt_function=prompt.coqa, hf_repo="coqa", hf_subset="default", hf_avail_splits=["train", "validation"], @@ -7098,7 +7099,7 @@ covid_dialogue_helm = LightevalTaskConfig( name="covid_dialogue", suite=["helm"], - prompt_function="covid_dialogue", + prompt_function=prompt.covid_dialogue, hf_repo="lighteval/covid_dialogue", hf_subset="default", hf_avail_splits=["train", "test", "validation"], @@ -7116,7 +7117,7 @@ crash_blossom_bigbench = LightevalTaskConfig( name="crash_blossom", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="crash_blossom", hf_avail_splits=["default", "train", "validation"], @@ -7134,7 +7135,7 @@ crass_ai_bigbench = LightevalTaskConfig( name="crass_ai", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="crass_ai", hf_avail_splits=["default", "train", "validation"], @@ -7152,7 +7153,7 @@ cryobiology_spanish_bigbench = LightevalTaskConfig( name="cryobiology_spanish", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="cryobiology_spanish", hf_avail_splits=["default", "train", "validation"], @@ -7170,7 +7171,7 @@ cryptonite_bigbench = LightevalTaskConfig( name="cryptonite", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="cryptonite", hf_avail_splits=["default", "train", "validation"], @@ -7188,7 +7189,7 @@ cs_algorithms_bigbench = LightevalTaskConfig( name="cs_algorithms", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="cs_algorithms", hf_avail_splits=["default", "train", "validation"], @@ -7206,7 +7207,7 @@ dark_humor_detection_bigbench = LightevalTaskConfig( 
name="dark_humor_detection", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="dark_humor_detection", hf_avail_splits=["default", "train", "validation"], @@ -7224,7 +7225,7 @@ date_understanding_bigbench = LightevalTaskConfig( name="date_understanding", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="date_understanding", hf_avail_splits=["default", "train", "validation"], @@ -7242,7 +7243,7 @@ disambiguation_qa_bigbench = LightevalTaskConfig( name="disambiguation_qa", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="disambiguation_qa", hf_avail_splits=["default", "train", "validation"], @@ -7260,7 +7261,7 @@ discourse_marker_prediction_bigbench = LightevalTaskConfig( name="discourse_marker_prediction", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="discourse_marker_prediction", hf_avail_splits=["default", "train", "validation"], @@ -7278,7 +7279,7 @@ disfl_qa_bigbench = LightevalTaskConfig( name="disfl_qa", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="disfl_qa", hf_avail_splits=["default", "train", "validation"], @@ -7296,7 +7297,7 @@ drop_lighteval = LightevalTaskConfig( name="drop", suite=["lighteval"], - prompt_function="drop", + prompt_function=prompt.drop, hf_repo="lighteval/drop_harness", hf_subset="default", hf_avail_splits=["train", "validation"], @@ -7314,7 +7315,7 @@ dyck_language_2_helm = LightevalTaskConfig( name="dyck_language:2", suite=["helm"], - prompt_function="dyck_language", + prompt_function=prompt.dyck_language, hf_repo="lighteval/DyckLanguage", hf_subset="2", hf_avail_splits=["train", "test"], @@ -7332,7 +7333,7 @@ dyck_language_3_helm = LightevalTaskConfig( name="dyck_language:3", suite=["helm"], - prompt_function="dyck_language", + prompt_function=prompt.dyck_language, hf_repo="lighteval/DyckLanguage", hf_subset="3", hf_avail_splits=["train", "test"], @@ -7350,7 +7351,7 @@ dyck_language_4_helm = LightevalTaskConfig( name="dyck_language:4", suite=["helm"], - prompt_function="dyck_language", + prompt_function=prompt.dyck_language, hf_repo="lighteval/DyckLanguage", hf_subset="4", hf_avail_splits=["train", "test"], @@ -7368,7 +7369,7 @@ dyck_languages_bigbench = LightevalTaskConfig( name="dyck_languages", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="dyck_languages", hf_avail_splits=["default", "train", "validation"], @@ -7386,7 +7387,7 @@ elementary_math_qa_bigbench = LightevalTaskConfig( name="elementary_math_qa", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="elementary_math_qa", hf_avail_splits=["default", "train", "validation"], @@ -7404,7 +7405,7 @@ emoji_movie_bigbench_lite = LightevalTaskConfig( name="emoji_movie", suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function="bigbench_linefeed_before_whitespace_after_query", + prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, hf_repo="bigbench", hf_subset="emoji_movie", hf_avail_splits=["default", "train", "validation"], @@ -7422,7 +7423,7 @@ emojis_emotion_prediction_bigbench = 
LightevalTaskConfig( name="emojis_emotion_prediction", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="emojis_emotion_prediction", hf_avail_splits=["default", "train", "validation"], @@ -7440,7 +7441,7 @@ empirical_judgments_bigbench = LightevalTaskConfig( name="empirical_judgments", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="empirical_judgments", hf_avail_splits=["default", "train", "validation"], @@ -7458,7 +7459,7 @@ english_proverbs_bigbench = LightevalTaskConfig( name="english_proverbs", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="english_proverbs", hf_avail_splits=["default", "train", "validation"], @@ -7476,7 +7477,7 @@ english_russian_proverbs_bigbench = LightevalTaskConfig( name="english_russian_proverbs", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="english_russian_proverbs", hf_avail_splits=["default", "train", "validation"], @@ -7494,7 +7495,7 @@ entailed_polarity_bigbench = LightevalTaskConfig( name="entailed_polarity", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="entailed_polarity", hf_avail_splits=["default", "train", "validation"], @@ -7512,7 +7513,7 @@ entailed_polarity_hindi_bigbench = LightevalTaskConfig( name="entailed_polarity_hindi", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="entailed_polarity_hindi", hf_avail_splits=["default", "train", "validation"], @@ -7530,7 +7531,7 @@ entity_data_imputation_Buy_helm = LightevalTaskConfig( name="entity_data_imputation:Buy", suite=["helm"], - prompt_function="entity_data_imputation", + prompt_function=prompt.entity_data_imputation, hf_repo="lighteval/Buy", hf_subset="default", hf_avail_splits=["train", "test", "valid"], @@ -7548,7 +7549,7 @@ entity_data_imputation_Restaurant_helm = LightevalTaskConfig( name="entity_data_imputation:Restaurant", suite=["helm"], - prompt_function="entity_data_imputation", + prompt_function=prompt.entity_data_imputation, hf_repo="lighteval/Restaurant", hf_subset="default", hf_avail_splits=["train"], @@ -7566,7 +7567,7 @@ entity_matching_Abt_Buy_helm = LightevalTaskConfig( name="entity_matching:Abt_Buy", suite=["helm"], - prompt_function="entity_matching", + prompt_function=prompt.entity_matching, hf_repo="lighteval/EntityMatching", hf_subset="Abt_Buy", hf_avail_splits=["train", "test", "validation"], @@ -7584,7 +7585,7 @@ entity_matching_Amazon_Google_helm = LightevalTaskConfig( name="entity_matching:Amazon_Google", suite=["helm"], - prompt_function="entity_matching", + prompt_function=prompt.entity_matching, hf_repo="lighteval/EntityMatching", hf_subset="Amazon_Google", hf_avail_splits=["train", "test", "validation"], @@ -7602,7 +7603,7 @@ entity_matching_Beer_helm = LightevalTaskConfig( name="entity_matching:Beer", suite=["helm"], - prompt_function="entity_matching", + prompt_function=prompt.entity_matching, hf_repo="lighteval/EntityMatching", hf_subset="Beer", hf_avail_splits=["train", "test", "validation"], @@ -7620,7 +7621,7 @@ entity_matching_Company_helm = LightevalTaskConfig( name="entity_matching:Company", suite=["helm"], - prompt_function="entity_matching", + 
prompt_function=prompt.entity_matching, hf_repo="lighteval/EntityMatching", hf_subset="Company", hf_avail_splits=["train", "test", "validation"], @@ -7638,7 +7639,7 @@ entity_matching_DBLP_ACM_helm = LightevalTaskConfig( name="entity_matching:DBLP_ACM", suite=["helm"], - prompt_function="entity_matching", + prompt_function=prompt.entity_matching, hf_repo="lighteval/EntityMatching", hf_subset="DBLP_ACM", hf_avail_splits=["train", "test", "validation"], @@ -7656,7 +7657,7 @@ entity_matching_DBLP_GoogleScholar_helm = LightevalTaskConfig( name="entity_matching:DBLP_GoogleScholar", suite=["helm"], - prompt_function="entity_matching", + prompt_function=prompt.entity_matching, hf_repo="lighteval/EntityMatching", hf_subset="DBLP_GoogleScholar", hf_avail_splits=["train", "test", "validation"], @@ -7674,7 +7675,7 @@ entity_matching_Dirty_DBLP_ACM_helm = LightevalTaskConfig( name="entity_matching:Dirty_DBLP_ACM", suite=["helm"], - prompt_function="entity_matching", + prompt_function=prompt.entity_matching, hf_repo="lighteval/EntityMatching", hf_subset="Dirty_DBLP_ACM", hf_avail_splits=["train", "test", "validation"], @@ -7692,7 +7693,7 @@ entity_matching_Dirty_DBLP_GoogleScholar_helm = LightevalTaskConfig( name="entity_matching:Dirty_DBLP_GoogleScholar", suite=["helm"], - prompt_function="entity_matching", + prompt_function=prompt.entity_matching, hf_repo="lighteval/EntityMatching", hf_subset="Dirty_DBLP_GoogleScholar", hf_avail_splits=["train", "test", "validation"], @@ -7710,7 +7711,7 @@ entity_matching_Dirty_Walmart_Amazon_helm = LightevalTaskConfig( name="entity_matching:Dirty_Walmart_Amazon", suite=["helm"], - prompt_function="entity_matching", + prompt_function=prompt.entity_matching, hf_repo="lighteval/EntityMatching", hf_subset="Dirty_Walmart_Amazon", hf_avail_splits=["train", "test", "validation"], @@ -7728,7 +7729,7 @@ entity_matching_Dirty_iTunes_Amazon_helm = LightevalTaskConfig( name="entity_matching:Dirty_iTunes_Amazon", suite=["helm"], - prompt_function="entity_matching", + prompt_function=prompt.entity_matching, hf_repo="lighteval/EntityMatching", hf_subset="Dirty_iTunes_Amazon", hf_avail_splits=["train", "test", "validation"], @@ -7746,7 +7747,7 @@ entity_matching_Fodors_Zagats_helm = LightevalTaskConfig( name="entity_matching=Fodors_Zagats", suite=["helm"], - prompt_function="entity_matching", + prompt_function=prompt.entity_matching, hf_repo="lighteval/EntityMatching", hf_subset="Fodors_Zagats", hf_avail_splits=["train", "test", "validation"], @@ -7764,7 +7765,7 @@ entity_matching_Walmart_Amazon_helm = LightevalTaskConfig( name="entity_matching:Walmart_Amazon", suite=["helm"], - prompt_function="entity_matching", + prompt_function=prompt.entity_matching, hf_repo="lighteval/EntityMatching", hf_subset="Walmart_Amazon", hf_avail_splits=["train", "test", "validation"], @@ -7782,7 +7783,7 @@ entity_matching_iTunes_Amazon_helm = LightevalTaskConfig( name="entity_matching:iTunes_Amazon", suite=["helm"], - prompt_function="entity_matching", + prompt_function=prompt.entity_matching, hf_repo="lighteval/EntityMatching", hf_subset="iTunes_Amazon", hf_avail_splits=["train", "test", "validation"], @@ -7800,7 +7801,7 @@ epistemic_reasoning_bigbench = LightevalTaskConfig( name="epistemic_reasoning", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="epistemic_reasoning", hf_avail_splits=["default", "train", "validation"], @@ -7818,7 +7819,7 @@ ethics_commonsense_lighteval = LightevalTaskConfig( 
name="ethics:commonsense", suite=["lighteval", "ethics"], - prompt_function="ethics_commonsense", + prompt_function=prompt.ethics_commonsense, hf_repo="lighteval/hendrycks_ethics", hf_subset="commonsense", hf_avail_splits=["train", "validation", "test"], @@ -7836,7 +7837,7 @@ ethics_deontology_lighteval = LightevalTaskConfig( name="ethics:deontology", suite=["lighteval", "ethics"], - prompt_function="ethics_deontology", + prompt_function=prompt.ethics_deontology, hf_repo="lighteval/hendrycks_ethics", hf_subset="deontology", hf_avail_splits=["train", "validation", "test"], @@ -7854,7 +7855,7 @@ ethics_justice_lighteval = LightevalTaskConfig( name="ethics:justice", suite=["lighteval", "ethics"], - prompt_function="ethics_justice", + prompt_function=prompt.ethics_justice, hf_repo="lighteval/hendrycks_ethics", hf_subset="justice", hf_avail_splits=["train", "validation", "test"], @@ -7872,7 +7873,7 @@ ethics_utilitarianism_lighteval = LightevalTaskConfig( name="ethics:utilitarianism", suite=["lighteval", "ethics"], - prompt_function="ethics_utilitarianism", + prompt_function=prompt.ethics_utilitarianism, hf_repo="lighteval/hendrycks_ethics", hf_subset="utilitarianism", hf_avail_splits=["train", "validation", "test"], @@ -7890,7 +7891,7 @@ ethics_virtue_lighteval = LightevalTaskConfig( name="ethics:virtue", suite=["lighteval", "ethics"], - prompt_function="ethics_virtue", + prompt_function=prompt.ethics_virtue, hf_repo="lighteval/hendrycks_ethics", hf_subset="virtue", hf_avail_splits=["train", "validation", "test"], @@ -7908,7 +7909,7 @@ evaluating_information_essentiality_bigbench = LightevalTaskConfig( name="evaluating_information_essentiality", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="evaluating_information_essentiality", hf_avail_splits=["default", "train", "validation"], @@ -7926,7 +7927,7 @@ fact_checker_bigbench = LightevalTaskConfig( name="fact_checker", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="fact_checker", hf_avail_splits=["default", "train", "validation"], @@ -7944,7 +7945,7 @@ fantasy_reasoning_bigbench = LightevalTaskConfig( name="fantasy_reasoning", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="fantasy_reasoning", hf_avail_splits=["default", "train", "validation"], @@ -7962,7 +7963,7 @@ few_shot_nlg_bigbench = LightevalTaskConfig( name="few_shot_nlg", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="few_shot_nlg", hf_avail_splits=["default", "train", "validation"], @@ -7980,7 +7981,7 @@ figure_of_speech_detection_bigbench = LightevalTaskConfig( name="figure_of_speech_detection", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="figure_of_speech_detection", hf_avail_splits=["default", "train", "validation"], @@ -7998,7 +7999,7 @@ formal_fallacies_syllogisms_negation_bigbench_lite = LightevalTaskConfig( name="formal_fallacies_syllogisms_negation", suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function="bigbench_linefeed_before_whitespace_after_query", + prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, hf_repo="bigbench", hf_subset="formal_fallacies_syllogisms_negation", hf_avail_splits=["default", "train", 
"validation"], @@ -8016,7 +8017,7 @@ gem_bigbench = LightevalTaskConfig( name="gem", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="gem", hf_avail_splits=["default", "train", "validation"], @@ -8034,7 +8035,7 @@ gender_inclusive_sentences_german_bigbench = LightevalTaskConfig( name="gender_inclusive_sentences_german", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="gender_inclusive_sentences_german", hf_avail_splits=["default", "train", "validation"], @@ -8052,7 +8053,7 @@ general_knowledge_bigbench = LightevalTaskConfig( name="general_knowledge", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="general_knowledge", hf_avail_splits=["default", "train", "validation"], @@ -8070,7 +8071,7 @@ geometric_shapes_bigbench = LightevalTaskConfig( name="geometric_shapes", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="geometric_shapes", hf_avail_splits=["default", "train", "validation"], @@ -8088,7 +8089,7 @@ glue_cola_lighteval = LightevalTaskConfig( name="glue:cola", suite=["lighteval", "glue"], - prompt_function="cola", + prompt_function=prompt.cola, hf_repo="glue", hf_subset="cola", hf_avail_splits=["test", "train", "validation"], @@ -8106,7 +8107,7 @@ glue_mnli_lighteval = LightevalTaskConfig( name="glue:mnli", suite=["lighteval", "glue"], - prompt_function="mnli", + prompt_function=prompt.mnli, hf_repo="glue", hf_subset="mnli_matched", hf_avail_splits=["train", "validation"], @@ -8124,7 +8125,7 @@ glue_mnli_mismatched_lighteval = LightevalTaskConfig( name="glue:mnli_mismatched", suite=["lighteval", "glue"], - prompt_function="mnli", + prompt_function=prompt.mnli, hf_repo="glue", hf_subset="mnli_mismatched", hf_avail_splits=["train", "validation"], @@ -8142,7 +8143,7 @@ glue_mrpc_lighteval = LightevalTaskConfig( name="glue:mrpc", suite=["lighteval", "glue"], - prompt_function="mrpc", + prompt_function=prompt.mrpc, hf_repo="glue", hf_subset="mrpc", hf_avail_splits=["test", "train", "validation"], @@ -8160,7 +8161,7 @@ glue_qnli_lighteval = LightevalTaskConfig( name="glue:qnli", suite=["lighteval", "glue"], - prompt_function="qnli", + prompt_function=prompt.qnli, hf_repo="glue", hf_subset="qnli", hf_avail_splits=["test", "train", "validation"], @@ -8178,7 +8179,7 @@ glue_qqp_lighteval = LightevalTaskConfig( name="glue:qqp", suite=["lighteval", "glue"], - prompt_function="qqp", + prompt_function=prompt.qqp, hf_repo="glue", hf_subset="qqp", hf_avail_splits=["train", "validation", "test"], @@ -8196,7 +8197,7 @@ glue_rte_lighteval = LightevalTaskConfig( name="glue:rte", suite=["lighteval", "glue"], - prompt_function="rte", + prompt_function=prompt.rte, hf_repo="glue", hf_subset="rte", hf_avail_splits=["test", "train", "validation"], @@ -8214,7 +8215,7 @@ glue_sst2_lighteval = LightevalTaskConfig( name="glue:sst2", suite=["lighteval", "glue"], - prompt_function="sst", + prompt_function=prompt.sst, hf_repo="glue", hf_subset="sst2", hf_avail_splits=["test", "train", "validation"], @@ -8232,7 +8233,7 @@ glue_stsb_lighteval = LightevalTaskConfig( name="glue:stsb", suite=["lighteval", "glue"], - prompt_function="stsb", + prompt_function=prompt.stsb, hf_repo="glue", hf_subset="stsb", hf_avail_splits=["test", "train", "validation"], @@ -8250,7 +8251,7 @@ glue_wnli_lighteval = 
LightevalTaskConfig( name="glue:wnli", suite=["lighteval", "glue"], - prompt_function="wnli", + prompt_function=prompt.wnli, hf_repo="glue", hf_subset="wnli", hf_avail_splits=["test", "train", "validation"], @@ -8268,7 +8269,7 @@ goal_step_wikihow_bigbench = LightevalTaskConfig( name="goal_step_wikihow", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="goal_step_wikihow", hf_avail_splits=["default", "train", "validation"], @@ -8286,7 +8287,7 @@ gpqa_lighteval = LightevalTaskConfig( name="gpqa", suite=["lighteval"], - prompt_function="gpqa", + prompt_function=prompt.gpqa, hf_repo="Idavidrein/gpqa", hf_subset="gpqa_main", hf_avail_splits=["train"], @@ -8304,7 +8305,7 @@ gre_reading_comprehension_bigbench = LightevalTaskConfig( name="gre_reading_comprehension", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="gre_reading_comprehension", hf_avail_splits=["default", "train", "validation"], @@ -8322,7 +8323,7 @@ gsm8k_leaderboard = LightevalTaskConfig( name="gsm8k", suite=["leaderboard"], - prompt_function="gsm8k", + prompt_function=prompt.gsm8k, hf_repo="gsm8k", hf_subset="main", hf_avail_splits=["train", "test"], @@ -8340,7 +8341,7 @@ gsm8k_lighteval = LightevalTaskConfig( name="gsm8k", suite=["lighteval"], - prompt_function="gsm8k", + prompt_function=prompt.gsm8k, hf_repo="gsm8k", hf_subset="main", hf_avail_splits=["train", "test"], @@ -8358,7 +8359,7 @@ headqa_en_lighteval = LightevalTaskConfig( name="headqa:en", suite=["lighteval", "headqa"], - prompt_function="headqa", + prompt_function=prompt.headqa, hf_repo="lighteval/headqa_harness", hf_subset="en", hf_avail_splits=["train", "test", "validation"], @@ -8376,7 +8377,7 @@ headqa_es_lighteval = LightevalTaskConfig( name="headqa:es", suite=["lighteval", "headqa"], - prompt_function="headqa", + prompt_function=prompt.headqa, hf_repo="lighteval/headqa_harness", hf_subset="es", hf_avail_splits=["train", "test", "validation"], @@ -8394,7 +8395,7 @@ hellaswag_leaderboard = LightevalTaskConfig( name="hellaswag", suite=["leaderboard"], - prompt_function="hellaswag_harness", + prompt_function=prompt.hellaswag_harness, hf_repo="hellaswag", hf_subset="default", hf_avail_splits=["train", "test", "validation"], @@ -8412,7 +8413,7 @@ hellaswag_helm = LightevalTaskConfig( name="hellaswag", suite=["helm", "helm_general"], - prompt_function="hellaswag_helm", + prompt_function=prompt.hellaswag_helm, hf_repo="hellaswag", hf_subset="default", hf_avail_splits=["train", "test", "validation"], @@ -8430,7 +8431,7 @@ hhh_alignment_bigbench = LightevalTaskConfig( name="hhh_alignment", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="hhh_alignment", hf_avail_splits=["default", "train", "validation"], @@ -8448,7 +8449,7 @@ hindi_question_answering_bigbench = LightevalTaskConfig( name="hindi_question_answering", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="hindi_question_answering", hf_avail_splits=["default", "train", "validation"], @@ -8466,7 +8467,7 @@ hindu_knowledge_bigbench_lite = LightevalTaskConfig( name="hindu_knowledge", suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function="bigbench_linefeed_before_whitespace_after_query", + prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, 
hf_repo="bigbench", hf_subset="hindu_knowledge", hf_avail_splits=["default", "train", "validation"], @@ -8484,7 +8485,7 @@ hinglish_toxicity_bigbench = LightevalTaskConfig( name="hinglish_toxicity", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="hinglish_toxicity", hf_avail_splits=["default", "train", "validation"], @@ -8502,7 +8503,7 @@ human_organs_senses_bigbench = LightevalTaskConfig( name="human_organs_senses", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="human_organs_senses", hf_avail_splits=["default", "train", "validation"], @@ -8520,7 +8521,7 @@ humaneval_helm = LightevalTaskConfig( name="humaneval", suite=["helm", "code_scenario"], - prompt_function="humaneval", + prompt_function=prompt.humaneval, hf_repo="openai_humaneval", hf_subset="openai_humaneval", hf_avail_splits=["test"], @@ -8538,7 +8539,7 @@ hyperbaton_bigbench = LightevalTaskConfig( name="hyperbaton", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="hyperbaton", hf_avail_splits=["default", "train", "validation"], @@ -8556,7 +8557,7 @@ identify_math_theorems_bigbench = LightevalTaskConfig( name="identify_math_theorems", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="identify_math_theorems", hf_avail_splits=["default", "train", "validation"], @@ -8574,7 +8575,7 @@ identify_odd_metaphor_bigbench = LightevalTaskConfig( name="identify_odd_metaphor", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="identify_odd_metaphor", hf_avail_splits=["default", "train", "validation"], @@ -8592,7 +8593,7 @@ imdb_helm = LightevalTaskConfig( name="imdb", suite=["helm", "helm_general"], - prompt_function="imdb", + prompt_function=prompt.imdb, hf_repo="lighteval/IMDB_helm", hf_subset="default", hf_avail_splits=["train", "test"], @@ -8617,7 +8618,7 @@ imdb_contrastset_helm = LightevalTaskConfig( name="imdb:contrastset", suite=["helm"], - prompt_function="imdb_contrastset", + prompt_function=prompt.imdb_contrastset, hf_repo="lighteval/IMDB_helm", hf_subset="default", hf_avail_splits=["test"], @@ -8642,7 +8643,7 @@ implicatures_bigbench = LightevalTaskConfig( name="implicatures", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="implicatures", hf_avail_splits=["default", "train", "validation"], @@ -8660,7 +8661,7 @@ implicit_relations_bigbench = LightevalTaskConfig( name="implicit_relations", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="implicit_relations", hf_avail_splits=["default", "train", "validation"], @@ -8678,7 +8679,7 @@ intent_recognition_bigbench = LightevalTaskConfig( name="intent_recognition", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="intent_recognition", hf_avail_splits=["default", "train", "validation"], @@ -8696,7 +8697,7 @@ interactive_qa_mmlu_abstract_algebra_helm = LightevalTaskConfig( name="interactive_qa_mmlu:abstract_algebra", suite=["helm", "interactive_qa_mmlu_scenario"], - prompt_function="mmlu_qa_abstract_algebra", + 
prompt_function=prompt.mmlu_qa_abstract_algebra, hf_repo="lighteval/mmlu", hf_subset="abstract_algebra", hf_avail_splits=["dev", "test"], @@ -8714,7 +8715,7 @@ interactive_qa_mmlu_college_chemistry_helm = LightevalTaskConfig( name="interactive_qa_mmlu:college_chemistry", suite=["helm", "interactive_qa_mmlu_scenario"], - prompt_function="mmlu_qa_college_chemistry", + prompt_function=prompt.mmlu_qa_college_chemistry, hf_repo="lighteval/mmlu", hf_subset="college_chemistry", hf_avail_splits=["dev", "test"], @@ -8732,7 +8733,7 @@ interactive_qa_mmlu_global_facts_helm = LightevalTaskConfig( name="interactive_qa_mmlu:global_facts", suite=["helm", "interactive_qa_mmlu_scenario"], - prompt_function="mmlu_qa_global_facts", + prompt_function=prompt.mmlu_qa_global_facts, hf_repo="lighteval/mmlu", hf_subset="global_facts", hf_avail_splits=["dev", "test"], @@ -8750,7 +8751,7 @@ interactive_qa_mmlu_miscellaneous_helm = LightevalTaskConfig( name="interactive_qa_mmlu:miscellaneous", suite=["helm", "interactive_qa_mmlu_scenario"], - prompt_function="mmlu_qa_miscellaneous", + prompt_function=prompt.mmlu_qa_miscellaneous, hf_repo="lighteval/mmlu", hf_subset="miscellaneous", hf_avail_splits=["dev", "test"], @@ -8768,7 +8769,7 @@ interactive_qa_mmlu_nutrition_helm = LightevalTaskConfig( name="interactive_qa_mmlu:nutrition", suite=["helm", "interactive_qa_mmlu_scenario"], - prompt_function="mmlu_qa_nutrition", + prompt_function=prompt.mmlu_qa_nutrition, hf_repo="lighteval/mmlu", hf_subset="nutrition", hf_avail_splits=["dev", "test"], @@ -8786,7 +8787,7 @@ interactive_qa_mmlu_us_foreign_policy_helm = LightevalTaskConfig( name="interactive_qa_mmlu:us_foreign_policy", suite=["helm", "interactive_qa_mmlu_scenario"], - prompt_function="mmlu_qa_us_foreign_policy", + prompt_function=prompt.mmlu_qa_us_foreign_policy, hf_repo="lighteval/mmlu", hf_subset="us_foreign_policy", hf_avail_splits=["dev", "test"], @@ -8804,7 +8805,7 @@ international_phonetic_alphabet_nli_bigbench = LightevalTaskConfig( name="international_phonetic_alphabet_nli", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="international_phonetic_alphabet_nli", hf_avail_splits=["default", "train", "validation"], @@ -8822,7 +8823,7 @@ international_phonetic_alphabet_transliterate_bigbench = LightevalTaskConfig( name="international_phonetic_alphabet_transliterate", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="international_phonetic_alphabet_transliterate", hf_avail_splits=["default", "train", "validation"], @@ -8840,7 +8841,7 @@ intersect_geometry_bigbench = LightevalTaskConfig( name="intersect_geometry", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="intersect_geometry", hf_avail_splits=["default", "train", "validation"], @@ -8858,7 +8859,7 @@ irony_identification_bigbench = LightevalTaskConfig( name="irony_identification", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="irony_identification", hf_avail_splits=["default", "train", "validation"], @@ -8876,7 +8877,7 @@ iwslt17_ar_en_lighteval = LightevalTaskConfig( name="iwslt17:ar-en", suite=["lighteval", "harness_selection"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="iwslt17_ar-en", 
hf_avail_splits=["test"], @@ -8894,7 +8895,7 @@ iwslt17_de_en_lighteval = LightevalTaskConfig( name="iwslt17:de-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="iwslt17_de-en", hf_avail_splits=["test"], @@ -8912,7 +8913,7 @@ iwslt17_en_ar_lighteval = LightevalTaskConfig( name="iwslt17:en-ar", suite=["lighteval", "harness_selection"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="iwslt17_ar-en", hf_avail_splits=["test"], @@ -8930,7 +8931,7 @@ iwslt17_en_de_lighteval = LightevalTaskConfig( name="iwslt17:en-de", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="iwslt17_en-de", hf_avail_splits=["test"], @@ -8948,7 +8949,7 @@ iwslt17_en_fr_lighteval = LightevalTaskConfig( name="iwslt17:en-fr", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="iwslt17_en-fr", hf_avail_splits=["test"], @@ -8966,7 +8967,7 @@ iwslt17_en_ja_lighteval = LightevalTaskConfig( name="iwslt17:en-ja", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="iwslt17_en-ja", hf_avail_splits=["test"], @@ -8984,7 +8985,7 @@ iwslt17_en_ko_lighteval = LightevalTaskConfig( name="iwslt17:en-ko", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="iwslt17_en-ko", hf_avail_splits=["test"], @@ -9002,7 +9003,7 @@ iwslt17_en_zh_lighteval = LightevalTaskConfig( name="iwslt17:en-zh", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="iwslt17_en-zh", hf_avail_splits=["test"], @@ -9020,7 +9021,7 @@ iwslt17_fr_en_lighteval = LightevalTaskConfig( name="iwslt17:fr-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="iwslt17_fr-en", hf_avail_splits=["test"], @@ -9038,7 +9039,7 @@ iwslt17_ja_en_lighteval = LightevalTaskConfig( name="iwslt17:ja-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="iwslt17_ja-en", hf_avail_splits=["test"], @@ -9056,7 +9057,7 @@ iwslt17_ko_en_lighteval = LightevalTaskConfig( name="iwslt17:ko-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="iwslt17_ko-en", hf_avail_splits=["test"], @@ -9074,7 +9075,7 @@ iwslt17_zh_en_lighteval = LightevalTaskConfig( name="iwslt17:zh-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="iwslt17_zh-en", hf_avail_splits=["test"], @@ -9092,7 +9093,7 @@ kanji_ascii_bigbench = LightevalTaskConfig( name="kanji_ascii", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + 
prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="kanji_ascii", hf_avail_splits=["default", "train", "validation"], @@ -9110,7 +9111,7 @@ kannada_bigbench = LightevalTaskConfig( name="kannada", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="kannada", hf_avail_splits=["default", "train", "validation"], @@ -9128,7 +9129,7 @@ key_value_maps_bigbench = LightevalTaskConfig( name="key_value_maps", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="key_value_maps", hf_avail_splits=["default", "train", "validation"], @@ -9146,7 +9147,7 @@ known_unknowns_bigbench_lite = LightevalTaskConfig( name="known_unknowns", suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function="bigbench_linefeed_before_whitespace_after_query", + prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, hf_repo="bigbench", hf_subset="known_unknowns", hf_avail_splits=["default", "train", "validation"], @@ -9164,7 +9165,7 @@ lambada_standard_lighteval = LightevalTaskConfig( name="lambada:standard", suite=["lighteval", "lambada"], - prompt_function="lambada", + prompt_function=prompt.lambada, hf_repo="lambada", hf_subset="plain_text", hf_avail_splits=["train", "test", "validation"], @@ -9182,7 +9183,7 @@ lambada_standard_cloze_lighteval = LightevalTaskConfig( name="lambada:standard_cloze", suite=["lighteval", "lambada"], - prompt_function="lambada_cloze", + prompt_function=prompt.lambada_cloze, hf_repo="lambada", hf_subset="plain_text", hf_avail_splits=["train", "test", "validation"], @@ -9200,7 +9201,7 @@ lambada_openai_lighteval = LightevalTaskConfig( name="lambada:openai", suite=["lighteval", "lambada"], - prompt_function="lambada", + prompt_function=prompt.lambada, hf_repo="EleutherAI/lambada_openai", hf_subset="default", hf_avail_splits=["test"], @@ -9218,7 +9219,7 @@ lambada_openai_de_lighteval = LightevalTaskConfig( name="lambada:openai:de", suite=["lighteval", "lambada"], - prompt_function="lambada", + prompt_function=prompt.lambada, hf_repo="EleutherAI/lambada_openai", hf_subset="de", hf_avail_splits=["test"], @@ -9236,7 +9237,7 @@ lambada_openai_en_lighteval = LightevalTaskConfig( name="lambada:openai:en", suite=["lighteval", "lambada"], - prompt_function="lambada", + prompt_function=prompt.lambada, hf_repo="EleutherAI/lambada_openai", hf_subset="en", hf_avail_splits=["test"], @@ -9254,7 +9255,7 @@ lambada_openai_es_lighteval = LightevalTaskConfig( name="lambada:openai:es", suite=["lighteval", "lambada"], - prompt_function="lambada", + prompt_function=prompt.lambada, hf_repo="EleutherAI/lambada_openai", hf_subset="es", hf_avail_splits=["test"], @@ -9272,7 +9273,7 @@ lambada_openai_fr_lighteval = LightevalTaskConfig( name="lambada:openai:fr", suite=["lighteval", "lambada"], - prompt_function="lambada", + prompt_function=prompt.lambada, hf_repo="EleutherAI/lambada_openai", hf_subset="fr", hf_avail_splits=["test"], @@ -9290,7 +9291,7 @@ lambada_openai_it_lighteval = LightevalTaskConfig( name="lambada:openai:it", suite=["lighteval", "lambada"], - prompt_function="lambada", + prompt_function=prompt.lambada, hf_repo="EleutherAI/lambada_openai", hf_subset="it", hf_avail_splits=["test"], @@ -9308,7 +9309,7 @@ lambada_openai_cloze_lighteval = LightevalTaskConfig( name="lambada:openai_cloze", suite=["lighteval", "lambada"], - prompt_function="lambada_cloze", + prompt_function=prompt.lambada_cloze, 
hf_repo="EleutherAI/lambada_openai", hf_subset="en", hf_avail_splits=["test"], @@ -9326,7 +9327,7 @@ language_games_bigbench = LightevalTaskConfig( name="language_games", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="language_games", hf_avail_splits=["default", "train", "validation"], @@ -9344,7 +9345,7 @@ language_identification_bigbench_lite = LightevalTaskConfig( name="language_identification", suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="language_identification", hf_avail_splits=["default", "train", "validation"], @@ -9362,7 +9363,7 @@ legal_summarization_billsum_helm = LightevalTaskConfig( name="legal_summarization:billsum", suite=["helm"], - prompt_function="legal_summarization", + prompt_function=prompt.legal_summarization, hf_repo="lighteval/legal_summarization", hf_subset="BillSum", hf_avail_splits=["train", "test"], @@ -9380,7 +9381,7 @@ legal_summarization_eurlexsum_helm = LightevalTaskConfig( name="legal_summarization:eurlexsum", suite=["helm"], - prompt_function="legal_summarization", + prompt_function=prompt.legal_summarization, hf_repo="lighteval/legal_summarization", hf_subset="EurLexSum", hf_avail_splits=["train", "test", "validation"], @@ -9398,7 +9399,7 @@ legal_summarization_multilexsum_helm = LightevalTaskConfig( name="legal_summarization:multilexsum", suite=["helm"], - prompt_function="multilexsum", + prompt_function=prompt.multilexsum, hf_repo="lighteval/legal_summarization", hf_subset="MultiLexSum", hf_avail_splits=["train", "test", "validation"], @@ -9416,7 +9417,7 @@ legalsupport_helm = LightevalTaskConfig( name="legalsupport", suite=["helm"], - prompt_function="legal_support", + prompt_function=prompt.legal_support, hf_repo="lighteval/LegalSupport", hf_subset="default", hf_avail_splits=["train", "test", "validation"], @@ -9434,7 +9435,7 @@ lexglue_case_hold_helm = LightevalTaskConfig( name="lexglue:case_hold", suite=["helm", "lex_glue_scenario"], - prompt_function="lex_glue_case_hold", + prompt_function=prompt.lex_glue_case_hold, hf_repo="lighteval/lexglue", hf_subset="case_hold", hf_avail_splits=["train", "test", "validation"], @@ -9452,7 +9453,7 @@ lexglue_ecthr_a_helm = LightevalTaskConfig( name="lexglue:ecthr_a", suite=["helm", "lex_glue_scenario"], - prompt_function="lex_glue_ecthr_a", + prompt_function=prompt.lex_glue_ecthr_a, hf_repo="lighteval/lexglue", hf_subset="ecthr_a", hf_avail_splits=["train", "test", "validation"], @@ -9470,7 +9471,7 @@ lexglue_ecthr_b_helm = LightevalTaskConfig( name="lexglue:ecthr_b", suite=["helm", "lex_glue_scenario"], - prompt_function="lex_glue_ecthr_b", + prompt_function=prompt.lex_glue_ecthr_b, hf_repo="lighteval/lexglue", hf_subset="ecthr_b", hf_avail_splits=["train", "test", "validation"], @@ -9488,7 +9489,7 @@ lexglue_eurlex_helm = LightevalTaskConfig( name="lexglue:eurlex", suite=["helm", "lex_glue_scenario"], - prompt_function="lex_glue_eurlex", + prompt_function=prompt.lex_glue_eurlex, hf_repo="lighteval/lexglue", hf_subset="eurlex", hf_avail_splits=["train", "test", "validation"], @@ -9506,7 +9507,7 @@ lexglue_ledgar_helm = LightevalTaskConfig( name="lexglue:ledgar", suite=["helm", "lex_glue_scenario"], - prompt_function="lex_glue_ledgar", + prompt_function=prompt.lex_glue_ledgar, hf_repo="lighteval/lexglue", hf_subset="ledgar", hf_avail_splits=["train", "test", "validation"], @@ -9524,7 +9525,7 @@ lexglue_scotus_helm = 
LightevalTaskConfig( name="lexglue:scotus", suite=["helm", "lex_glue_scenario"], - prompt_function="lex_glue_scotus", + prompt_function=prompt.lex_glue_scotus, hf_repo="lighteval/lexglue", hf_subset="scotus", hf_avail_splits=["train", "test", "validation"], @@ -9542,7 +9543,7 @@ lexglue_unfair_tos_helm = LightevalTaskConfig( name="lexglue:unfair_tos", suite=["helm", "lex_glue_scenario"], - prompt_function="lex_glue_unfair_tos", + prompt_function=prompt.lex_glue_unfair_tos, hf_repo="lighteval/lexglue", hf_subset="unfair_tos", hf_avail_splits=["train", "test", "validation"], @@ -9560,7 +9561,7 @@ lextreme_brazilian_court_decisions_judgment_helm = LightevalTaskConfig( name="lextreme:brazilian_court_decisions_judgment", suite=["helm", "lextreme_scenario"], - prompt_function="lextreme_brazilian_court_decisions_judgment", + prompt_function=prompt.lextreme_brazilian_court_decisions_judgment, hf_repo="lighteval/lextreme", hf_subset="brazilian_court_decisions_judgment", hf_avail_splits=["train", "test", "validation"], @@ -9578,7 +9579,7 @@ lextreme_brazilian_court_decisions_unanimity_helm = LightevalTaskConfig( name="lextreme:brazilian_court_decisions_unanimity", suite=["helm", "lextreme_scenario"], - prompt_function="lextreme_brazilian_court_decisions_unanimity", + prompt_function=prompt.lextreme_brazilian_court_decisions_unanimity, hf_repo="lighteval/lextreme", hf_subset="brazilian_court_decisions_unanimity", hf_avail_splits=["train", "test", "validation"], @@ -9596,7 +9597,7 @@ lextreme_covid19_emergency_event_helm = LightevalTaskConfig( name="lextreme:covid19_emergency_event", suite=["helm", "lextreme_scenario"], - prompt_function="lextreme_covid19_emergency_event", + prompt_function=prompt.lextreme_covid19_emergency_event, hf_repo="lighteval/lextreme", hf_subset="covid19_emergency_event", hf_avail_splits=["train", "test", "validation"], @@ -9614,7 +9615,7 @@ lextreme_german_argument_mining_helm = LightevalTaskConfig( name="lextreme:german_argument_mining", suite=["helm", "lextreme_scenario"], - prompt_function="lextreme_german_argument_mining", + prompt_function=prompt.lextreme_german_argument_mining, hf_repo="lighteval/lextreme", hf_subset="german_argument_mining", hf_avail_splits=["train", "test", "validation"], @@ -9632,7 +9633,7 @@ lextreme_greek_legal_code_chapter_helm = LightevalTaskConfig( name="lextreme:greek_legal_code_chapter", suite=["helm", "lextreme_scenario"], - prompt_function="lextreme_greek_legal_code_chapter", + prompt_function=prompt.lextreme_greek_legal_code_chapter, hf_repo="lighteval/lextreme", hf_subset="greek_legal_code_chapter", hf_avail_splits=["train", "test", "validation"], @@ -9650,7 +9651,7 @@ lextreme_greek_legal_code_subject_helm = LightevalTaskConfig( name="lextreme:greek_legal_code_subject", suite=["helm", "lextreme_scenario"], - prompt_function="lextreme_greek_legal_code_subject", + prompt_function=prompt.lextreme_greek_legal_code_subject, hf_repo="lighteval/lextreme", hf_subset="greek_legal_code_subject", hf_avail_splits=["train", "test", "validation"], @@ -9668,7 +9669,7 @@ lextreme_greek_legal_code_volume_helm = LightevalTaskConfig( name="lextreme:greek_legal_code_volume", suite=["helm", "lextreme_scenario"], - prompt_function="lextreme_greek_legal_code_volume", + prompt_function=prompt.lextreme_greek_legal_code_volume, hf_repo="lighteval/lextreme", hf_subset="greek_legal_code_volume", hf_avail_splits=["train", "test", "validation"], @@ -9686,7 +9687,7 @@ lextreme_greek_legal_ner_helm = LightevalTaskConfig( name="lextreme:greek_legal_ner", suite=["helm", 
"lextreme_scenario"], - prompt_function="lextreme_greek_legal_ner", + prompt_function=prompt.lextreme_greek_legal_ner, hf_repo="lighteval/lextreme", hf_subset="greek_legal_ner", hf_avail_splits=["train", "test", "validation"], @@ -9704,7 +9705,7 @@ lextreme_legalnero_helm = LightevalTaskConfig( name="lextreme:legalnero", suite=["helm", "lextreme_scenario"], - prompt_function="lextreme_legalnero", + prompt_function=prompt.lextreme_legalnero, hf_repo="lighteval/lextreme", hf_subset="legalnero", hf_avail_splits=["train", "test", "validation"], @@ -9722,7 +9723,7 @@ lextreme_lener_br_helm = LightevalTaskConfig( name="lextreme:lener_br", suite=["helm", "lextreme_scenario"], - prompt_function="lextreme_lener_br", + prompt_function=prompt.lextreme_lener_br, hf_repo="lighteval/lextreme", hf_subset="lener_br", hf_avail_splits=["train", "test", "validation"], @@ -9740,7 +9741,7 @@ lextreme_mapa_coarse_helm = LightevalTaskConfig( name="lextreme:mapa_coarse", suite=["helm", "lextreme_scenario"], - prompt_function="lextreme_mapa_coarse", + prompt_function=prompt.lextreme_mapa_coarse, hf_repo="lighteval/lextreme", hf_subset="mapa_coarse", hf_avail_splits=["train", "test", "validation"], @@ -9758,7 +9759,7 @@ lextreme_mapa_fine_helm = LightevalTaskConfig( name="lextreme:mapa_fine", suite=["helm", "lextreme_scenario"], - prompt_function="lextreme_mapa_fine", + prompt_function=prompt.lextreme_mapa_fine, hf_repo="lighteval/lextreme", hf_subset="mapa_fine", hf_avail_splits=["train", "test", "validation"], @@ -9776,7 +9777,7 @@ lextreme_multi_eurlex_level_1_helm = LightevalTaskConfig( name="lextreme:multi_eurlex_level_1", suite=["helm", "lextreme_scenario"], - prompt_function="lextreme_multi_eurlex_level_1", + prompt_function=prompt.lextreme_multi_eurlex_level_1, hf_repo="lighteval/lextreme", hf_subset="multi_eurlex_level_1", hf_avail_splits=["train", "test", "validation"], @@ -9794,7 +9795,7 @@ lextreme_multi_eurlex_level_2_helm = LightevalTaskConfig( name="lextreme:multi_eurlex_level_2", suite=["helm", "lextreme_scenario"], - prompt_function="lextreme_multi_eurlex_level_2", + prompt_function=prompt.lextreme_multi_eurlex_level_2, hf_repo="lighteval/lextreme", hf_subset="multi_eurlex_level_2", hf_avail_splits=["train", "test", "validation"], @@ -9812,7 +9813,7 @@ lextreme_multi_eurlex_level_3_helm = LightevalTaskConfig( name="lextreme:multi_eurlex_level_3", suite=["helm", "lextreme_scenario"], - prompt_function="lextreme_multi_eurlex_level_3", + prompt_function=prompt.lextreme_multi_eurlex_level_3, hf_repo="lighteval/lextreme", hf_subset="multi_eurlex_level_3", hf_avail_splits=["train", "test", "validation"], @@ -9830,7 +9831,7 @@ lextreme_online_terms_of_service_clause_topics_helm = LightevalTaskConfig( name="lextreme:online_terms_of_service_clause_topics", suite=["helm", "lextreme_scenario"], - prompt_function="lextreme_online_terms_of_service_clause_topics", + prompt_function=prompt.lextreme_online_terms_of_service_clause_topics, hf_repo="lighteval/lextreme", hf_subset="online_terms_of_service_clause_topics", hf_avail_splits=["train", "test", "validation"], @@ -9848,7 +9849,7 @@ lextreme_online_terms_of_service_unfairness_levels_helm = LightevalTaskConfig( name="lextreme:online_terms_of_service_unfairness_levels", suite=["helm", "lextreme_scenario"], - prompt_function="lextreme_online_terms_of_service_unfairness_levels", + prompt_function=prompt.lextreme_online_terms_of_service_unfairness_levels, hf_repo="lighteval/lextreme", hf_subset="online_terms_of_service_unfairness_levels", hf_avail_splits=["train", 
"test", "validation"], @@ -9866,7 +9867,7 @@ lextreme_swiss_judgment_prediction_helm = LightevalTaskConfig( name="lextreme:swiss_judgment_prediction", suite=["helm", "lextreme_scenario"], - prompt_function="lextreme_swiss_judgment_prediction", + prompt_function=prompt.lextreme_swiss_judgment_prediction, hf_repo="lighteval/lextreme", hf_subset="swiss_judgment_prediction", hf_avail_splits=["train", "test", "validation"], @@ -9884,7 +9885,7 @@ linguistic_mappings_bigbench = LightevalTaskConfig( name="linguistic_mappings", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="linguistic_mappings", hf_avail_splits=["default", "train", "validation"], @@ -9902,7 +9903,7 @@ linguistics_puzzles_bigbench_lite = LightevalTaskConfig( name="linguistics_puzzles", suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function="bigbench_whitespace_after_query", + prompt_function=prompt.bigbench_whitespace_after_query, hf_repo="bigbench", hf_subset="linguistics_puzzles", hf_avail_splits=["default", "train", "validation"], @@ -9919,7 +9920,7 @@ logic_grid_puzzle_bigbench_lite = LightevalTaskConfig( name="logic_grid_puzzle", suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="logic_grid_puzzle", hf_avail_splits=["default", "train", "validation"], @@ -9937,7 +9938,7 @@ logical_args_bigbench = LightevalTaskConfig( name="logical_args", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="logical_args", hf_avail_splits=["default", "train", "validation"], @@ -9955,7 +9956,7 @@ logical_deduction_bigbench_lite = LightevalTaskConfig( name="logical_deduction", suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function="bigbench_whitespace_after_query", + prompt_function=prompt.bigbench_whitespace_after_query, hf_repo="bigbench", hf_subset="logical_deduction", hf_avail_splits=["default", "train", "validation"], @@ -9973,7 +9974,7 @@ logical_fallacy_detection_bigbench = LightevalTaskConfig( name="logical_fallacy_detection", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="logical_fallacy_detection", hf_avail_splits=["default", "train", "validation"], @@ -9991,7 +9992,7 @@ logical_sequence_bigbench = LightevalTaskConfig( name="logical_sequence", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="logical_sequence", hf_avail_splits=["default", "train", "validation"], @@ -10009,7 +10010,7 @@ logiqa_lighteval = LightevalTaskConfig( name="logiqa", suite=["lighteval"], - prompt_function="logiqa", + prompt_function=prompt.logiqa, hf_repo="lighteval/logiqa_harness", hf_subset="logiqa", hf_avail_splits=["train", "validation", "test"], @@ -10027,7 +10028,7 @@ lsat_qa_helm = LightevalTaskConfig( name="lsat_qa", suite=["helm", "lsat_qa_scenario"], - prompt_function="lsat_qa", + prompt_function=prompt.lsat_qa, hf_repo="lighteval/lsat_qa", hf_subset="all", hf_avail_splits=["train", "test", "validation"], @@ -10045,7 +10046,7 @@ lsat_qa_assignment_helm = LightevalTaskConfig( name="lsat_qa:assignment", suite=["helm", "lsat_qa_scenario"], - prompt_function="lsat_qa", + prompt_function=prompt.lsat_qa, hf_repo="lighteval/lsat_qa", hf_subset="assignment", hf_avail_splits=["train", "test", "validation"], 
@@ -10063,7 +10064,7 @@ lsat_qa_grouping_helm = LightevalTaskConfig( name="lsat_qa:grouping", suite=["helm", "lsat_qa_scenario"], - prompt_function="lsat_qa", + prompt_function=prompt.lsat_qa, hf_repo="lighteval/lsat_qa", hf_subset="grouping", hf_avail_splits=["train", "test", "validation"], @@ -10081,7 +10082,7 @@ lsat_qa_miscellaneous_helm = LightevalTaskConfig( name="lsat_qa:miscellaneous", suite=["helm", "lsat_qa_scenario"], - prompt_function="lsat_qa", + prompt_function=prompt.lsat_qa, hf_repo="lighteval/lsat_qa", hf_subset="miscellaneous", hf_avail_splits=["train", "test", "validation"], @@ -10099,7 +10100,7 @@ lsat_qa_ordering_helm = LightevalTaskConfig( name="lsat_qa:ordering", suite=["helm", "lsat_qa_scenario"], - prompt_function="lsat_qa", + prompt_function=prompt.lsat_qa, hf_repo="lighteval/lsat_qa", hf_subset="ordering", hf_avail_splits=["train", "test", "validation"], @@ -10117,7 +10118,7 @@ math_algebra_lighteval = LightevalTaskConfig( name="math:algebra", suite=["lighteval", "math"], - prompt_function="math", + prompt_function=prompt.math, hf_repo="lighteval/MATH", hf_subset="algebra", hf_avail_splits=["train", "test", "validation"], @@ -10135,7 +10136,7 @@ math_counting_and_probability_lighteval = LightevalTaskConfig( name="math:counting_and_probability", suite=["lighteval", "math"], - prompt_function="math", + prompt_function=prompt.math, hf_repo="lighteval/MATH", hf_subset="counting_and_probability", hf_avail_splits=["train", "test", "validation"], @@ -10153,7 +10154,7 @@ math_geometry_lighteval = LightevalTaskConfig( name="math:geometry", suite=["lighteval", "math"], - prompt_function="math", + prompt_function=prompt.math, hf_repo="lighteval/MATH", hf_subset="geometry", hf_avail_splits=["train", "test", "validation"], @@ -10171,7 +10172,7 @@ math_intermediate_algebra_lighteval = LightevalTaskConfig( name="math:intermediate_algebra", suite=["lighteval", "math"], - prompt_function="math", + prompt_function=prompt.math, hf_repo="lighteval/MATH", hf_subset="intermediate_algebra", hf_avail_splits=["train", "test", "validation"], @@ -10189,7 +10190,7 @@ math_number_theory_lighteval = LightevalTaskConfig( name="math:number_theory", suite=["lighteval", "math"], - prompt_function="math", + prompt_function=prompt.math, hf_repo="lighteval/MATH", hf_subset="number_theory", hf_avail_splits=["train", "test", "validation"], @@ -10207,7 +10208,7 @@ math_prealgebra_lighteval = LightevalTaskConfig( name="math:prealgebra", suite=["lighteval", "math"], - prompt_function="math", + prompt_function=prompt.math, hf_repo="lighteval/MATH", hf_subset="prealgebra", hf_avail_splits=["train", "test", "validation"], @@ -10225,7 +10226,7 @@ math_precalculus_lighteval = LightevalTaskConfig( name="math:precalculus", suite=["lighteval", "math"], - prompt_function="math", + prompt_function=prompt.math, hf_repo="lighteval/MATH", hf_subset="precalculus", hf_avail_splits=["train", "test", "validation"], @@ -10243,7 +10244,7 @@ math_cot_algebra_lighteval = LightevalTaskConfig( name="math_cot:algebra", suite=["lighteval", "math"], - prompt_function="math_cot", + prompt_function=prompt.math_cot, hf_repo="lighteval/MATH", hf_subset="algebra", hf_avail_splits=["train", "test", "validation"], @@ -10261,7 +10262,7 @@ math_cot_counting_and_probability_lighteval = LightevalTaskConfig( name="math_cot:counting_and_probability", suite=["lighteval", "math"], - prompt_function="math_cot", + prompt_function=prompt.math_cot, hf_repo="lighteval/MATH", hf_subset="counting_and_probability", hf_avail_splits=["train", "test", 
"validation"], @@ -10279,7 +10280,7 @@ math_cot_geometry_lighteval = LightevalTaskConfig( name="math_cot:geometry", suite=["lighteval", "math"], - prompt_function="math_cot", + prompt_function=prompt.math_cot, hf_repo="lighteval/MATH", hf_subset="geometry", hf_avail_splits=["train", "test", "validation"], @@ -10297,7 +10298,7 @@ math_cot_intermediate_algebra_lighteval = LightevalTaskConfig( name="math_cot:intermediate_algebra", suite=["lighteval", "math"], - prompt_function="math_cot", + prompt_function=prompt.math_cot, hf_repo="lighteval/MATH", hf_subset="intermediate_algebra", hf_avail_splits=["train", "test", "validation"], @@ -10315,7 +10316,7 @@ math_cot_number_theory_lighteval = LightevalTaskConfig( name="math_cot:number_theory", suite=["lighteval", "math"], - prompt_function="math_cot", + prompt_function=prompt.math_cot, hf_repo="lighteval/MATH", hf_subset="number_theory", hf_avail_splits=["train", "test", "validation"], @@ -10333,7 +10334,7 @@ math_cot_prealgebra_lighteval = LightevalTaskConfig( name="math_cot:prealgebra", suite=["lighteval", "math"], - prompt_function="math_cot", + prompt_function=prompt.math_cot, hf_repo="lighteval/MATH", hf_subset="prealgebra", hf_avail_splits=["train", "test", "validation"], @@ -10351,7 +10352,7 @@ math_cot_precalculus_lighteval = LightevalTaskConfig( name="math_cot:precalculus", suite=["lighteval", "math"], - prompt_function="math_cot", + prompt_function=prompt.math_cot, hf_repo="lighteval/MATH", hf_subset="precalculus", hf_avail_splits=["train", "test", "validation"], @@ -10369,7 +10370,7 @@ mathematical_induction_bigbench = LightevalTaskConfig( name="mathematical_induction", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="mathematical_induction", hf_avail_splits=["default", "train", "validation"], @@ -10387,7 +10388,7 @@ mathqa_lighteval = LightevalTaskConfig( name="mathqa", suite=["lighteval"], - prompt_function="mathqa", + prompt_function=prompt.mathqa, hf_repo="math_qa", hf_subset="default", hf_avail_splits=["train", "validation", "test"], @@ -10405,7 +10406,7 @@ matrixshapes_bigbench = LightevalTaskConfig( name="matrixshapes", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="matrixshapes", hf_avail_splits=["default", "train", "validation"], @@ -10423,7 +10424,7 @@ me_q_sum_helm = LightevalTaskConfig( name="me_q_sum", suite=["helm"], - prompt_function="me_q_sum", + prompt_function=prompt.me_q_sum, hf_repo="lighteval/me_q_sum", hf_subset="default", hf_avail_splits=["train", "test", "validation"], @@ -10441,7 +10442,7 @@ med_dialog_healthcaremagic_helm = LightevalTaskConfig( name="med_dialog:healthcaremagic", suite=["helm"], - prompt_function="med_dialog", + prompt_function=prompt.med_dialog, hf_repo="lighteval/med_dialog", hf_subset="healthcaremagic", hf_avail_splits=["train", "test", "validation"], @@ -10459,7 +10460,7 @@ med_dialog_icliniq_helm = LightevalTaskConfig( name="med_dialog:icliniq", suite=["helm"], - prompt_function="med_dialog", + prompt_function=prompt.med_dialog, hf_repo="lighteval/med_dialog", hf_subset="icliniq", hf_avail_splits=["train", "test", "validation"], @@ -10477,7 +10478,7 @@ med_mcqa_helm = LightevalTaskConfig( name="med_mcqa", suite=["helm"], - prompt_function="med_mcqa", + prompt_function=prompt.med_mcqa, hf_repo="lighteval/med_mcqa", hf_subset="default", hf_avail_splits=["train", "test", "validation"], @@ -10495,7 +10496,7 @@ 
med_paragraph_simplification_helm = LightevalTaskConfig( name="med_paragraph_simplification", suite=["helm"], - prompt_function="med_paragraph_simplification", + prompt_function=prompt.med_paragraph_simplification, hf_repo="lighteval/med_paragraph_simplification", hf_subset="default", hf_avail_splits=["train", "test", "validation"], @@ -10513,7 +10514,7 @@ med_qa_helm = LightevalTaskConfig( name="med_qa", suite=["helm"], - prompt_function="med_qa", + prompt_function=prompt.med_qa, hf_repo="bigbio/med_qa", hf_subset="med_qa_en_source", hf_avail_splits=["train", "test", "validation"], @@ -10531,7 +10532,7 @@ metaphor_boolean_bigbench = LightevalTaskConfig( name="metaphor_boolean", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="metaphor_boolean", hf_avail_splits=["default", "train", "validation"], @@ -10549,7 +10550,7 @@ metaphor_understanding_bigbench = LightevalTaskConfig( name="metaphor_understanding", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="metaphor_understanding", hf_avail_splits=["default", "train", "validation"], @@ -10567,7 +10568,7 @@ mgsm_en_lighteval = LightevalTaskConfig( name="mgsm:en", suite=["lighteval"], - prompt_function="mgsm_en", + prompt_function=prompt.mgsm_en, hf_repo="juletxara/mgsm", hf_subset="en", hf_avail_splits=["train", "test"], @@ -10585,7 +10586,7 @@ mgsm_es_lighteval = LightevalTaskConfig( name="mgsm:es", suite=["lighteval"], - prompt_function="mgsm_es", + prompt_function=prompt.mgsm_es, hf_repo="juletxara/mgsm", hf_subset="es", hf_avail_splits=["train", "test"], @@ -10603,7 +10604,7 @@ mgsm_fr_lighteval = LightevalTaskConfig( name="mgsm:fr", suite=["lighteval"], - prompt_function="mgsm_fr", + prompt_function=prompt.mgsm_fr, hf_repo="juletxara/mgsm", hf_subset="fr", hf_avail_splits=["train", "test"], @@ -10621,7 +10622,7 @@ mgsm_de_lighteval = LightevalTaskConfig( name="mgsm:de", suite=["lighteval"], - prompt_function="mgsm_de", + prompt_function=prompt.mgsm_de, hf_repo="juletxara/mgsm", hf_subset="de", hf_avail_splits=["train", "test"], @@ -10639,7 +10640,7 @@ mgsm_ru_lighteval = LightevalTaskConfig( name="mgsm:ru", suite=["lighteval"], - prompt_function="mgsm_ru", + prompt_function=prompt.mgsm_ru, hf_repo="juletxara/mgsm", hf_subset="ru", hf_avail_splits=["train", "test"], @@ -10657,7 +10658,7 @@ mgsm_zh_lighteval = LightevalTaskConfig( name="mgsm:zh", suite=["lighteval"], - prompt_function="mgsm_zh", + prompt_function=prompt.mgsm_zh, hf_repo="juletxara/mgsm", hf_subset="zh", hf_avail_splits=["train", "test"], @@ -10675,7 +10676,7 @@ mgsm_ja_lighteval = LightevalTaskConfig( name="mgsm:ja", suite=["lighteval"], - prompt_function="mgsm_ja", + prompt_function=prompt.mgsm_ja, hf_repo="juletxara/mgsm", hf_subset="ja", hf_avail_splits=["train", "test"], @@ -10693,7 +10694,7 @@ mgsm_th_lighteval = LightevalTaskConfig( name="mgsm:th", suite=["lighteval"], - prompt_function="mgsm_th", + prompt_function=prompt.mgsm_th, hf_repo="juletxara/mgsm", hf_subset="th", hf_avail_splits=["train", "test"], @@ -10711,7 +10712,7 @@ mgsm_sw_lighteval = LightevalTaskConfig( name="mgsm:sw", suite=["lighteval"], - prompt_function="mgsm_sw", + prompt_function=prompt.mgsm_sw, hf_repo="juletxara/mgsm", hf_subset="sw", hf_avail_splits=["train", "test"], @@ -10729,7 +10730,7 @@ mgsm_bn_lighteval = LightevalTaskConfig( name="mgsm:bn", suite=["lighteval"], - prompt_function="mgsm_bn", + prompt_function=prompt.mgsm_bn, 
hf_repo="juletxara/mgsm", hf_subset="bn", hf_avail_splits=["train", "test"], @@ -10747,7 +10748,7 @@ mgsm_te_lighteval = LightevalTaskConfig( name="mgsm:te", suite=["lighteval"], - prompt_function="mgsm_te", + prompt_function=prompt.mgsm_te, hf_repo="juletxara/mgsm", hf_subset="te", hf_avail_splits=["train", "test"], @@ -10765,7 +10766,7 @@ minute_mysteries_qa_bigbench = LightevalTaskConfig( name="minute_mysteries_qa", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="minute_mysteries_qa", hf_avail_splits=["default", "train", "validation"], @@ -10783,7 +10784,7 @@ misconceptions_bigbench = LightevalTaskConfig( name="misconceptions", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="misconceptions", hf_avail_splits=["default", "train", "validation"], @@ -10801,7 +10802,7 @@ misconceptions_russian_bigbench_lite = LightevalTaskConfig( name="misconceptions_russian", suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="misconceptions_russian", hf_avail_splits=["default", "train", "validation"], @@ -10819,7 +10820,7 @@ mmlu_helm = LightevalTaskConfig( name="mmlu", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="all", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -10837,7 +10838,7 @@ mmlu_original = LightevalTaskConfig( name="mmlu", suite=["original"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="all", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -10855,7 +10856,7 @@ mmlu_abstract_algebra_original = LightevalTaskConfig( name="mmlu:abstract_algebra", suite=["original", "mmlu"], - prompt_function="mmlu_abstract_algebra", + prompt_function=prompt.mmlu_abstract_algebra, hf_repo="cais/mmlu", hf_subset="abstract_algebra", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -10873,7 +10874,7 @@ mmlu_abstract_algebra_leaderboard = LightevalTaskConfig( name="mmlu:abstract_algebra", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="abstract_algebra", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -10891,7 +10892,7 @@ mmlu_abstract_algebra_helm = LightevalTaskConfig( name="mmlu:abstract_algebra", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="abstract_algebra", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -10909,7 +10910,7 @@ mmlu_anatomy_original = LightevalTaskConfig( name="mmlu:anatomy", suite=["original", "mmlu"], - prompt_function="mmlu_anatomy", + prompt_function=prompt.mmlu_anatomy, hf_repo="cais/mmlu", hf_subset="anatomy", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -10927,7 +10928,7 @@ mmlu_anatomy_leaderboard = LightevalTaskConfig( name="mmlu:anatomy", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="anatomy", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -10945,7 +10946,7 @@ mmlu_anatomy_helm = LightevalTaskConfig( name="mmlu:anatomy", suite=["helm", "helm_general"], - 
prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="anatomy", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -10963,7 +10964,7 @@ mmlu_astronomy_original = LightevalTaskConfig( name="mmlu:astronomy", suite=["original", "mmlu"], - prompt_function="mmlu_astronomy", + prompt_function=prompt.mmlu_astronomy, hf_repo="cais/mmlu", hf_subset="astronomy", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -10981,7 +10982,7 @@ mmlu_astronomy_leaderboard = LightevalTaskConfig( name="mmlu:astronomy", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="astronomy", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -10999,7 +11000,7 @@ mmlu_astronomy_helm = LightevalTaskConfig( name="mmlu:astronomy", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="astronomy", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11017,7 +11018,7 @@ mmlu_business_ethics_original = LightevalTaskConfig( name="mmlu:business_ethics", suite=["original", "mmlu"], - prompt_function="mmlu_business_ethics", + prompt_function=prompt.mmlu_business_ethics, hf_repo="cais/mmlu", hf_subset="business_ethics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11035,7 +11036,7 @@ mmlu_business_ethics_leaderboard = LightevalTaskConfig( name="mmlu:business_ethics", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="business_ethics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11053,7 +11054,7 @@ mmlu_business_ethics_helm = LightevalTaskConfig( name="mmlu:business_ethics", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="business_ethics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11071,7 +11072,7 @@ mmlu_clinical_knowledge_original = LightevalTaskConfig( name="mmlu:clinical_knowledge", suite=["original", "mmlu"], - prompt_function="mmlu_clinical_knowledge", + prompt_function=prompt.mmlu_clinical_knowledge, hf_repo="cais/mmlu", hf_subset="clinical_knowledge", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11089,7 +11090,7 @@ mmlu_clinical_knowledge_leaderboard = LightevalTaskConfig( name="mmlu:clinical_knowledge", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="clinical_knowledge", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11107,7 +11108,7 @@ mmlu_clinical_knowledge_helm = LightevalTaskConfig( name="mmlu:clinical_knowledge", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="clinical_knowledge", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11125,7 +11126,7 @@ mmlu_college_biology_original = LightevalTaskConfig( name="mmlu:college_biology", suite=["original", "mmlu"], - prompt_function="mmlu_college_biology", + prompt_function=prompt.mmlu_college_biology, hf_repo="cais/mmlu", hf_subset="college_biology", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11143,7 +11144,7 @@ mmlu_college_biology_leaderboard = LightevalTaskConfig( 
name="mmlu:college_biology", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="college_biology", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11161,7 +11162,7 @@ mmlu_college_biology_helm = LightevalTaskConfig( name="mmlu:college_biology", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="college_biology", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11179,7 +11180,7 @@ mmlu_college_chemistry_original = LightevalTaskConfig( name="mmlu:college_chemistry", suite=["original", "mmlu"], - prompt_function="mmlu_college_chemistry", + prompt_function=prompt.mmlu_college_chemistry, hf_repo="cais/mmlu", hf_subset="college_chemistry", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11197,7 +11198,7 @@ mmlu_college_chemistry_leaderboard = LightevalTaskConfig( name="mmlu:college_chemistry", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="college_chemistry", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11215,7 +11216,7 @@ mmlu_college_chemistry_helm = LightevalTaskConfig( name="mmlu:college_chemistry", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="college_chemistry", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11233,7 +11234,7 @@ mmlu_college_computer_science_original = LightevalTaskConfig( name="mmlu:college_computer_science", suite=["original", "mmlu"], - prompt_function="mmlu_college_computer_science", + prompt_function=prompt.mmlu_college_computer_science, hf_repo="cais/mmlu", hf_subset="college_computer_science", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11251,7 +11252,7 @@ mmlu_college_computer_science_leaderboard = LightevalTaskConfig( name="mmlu:college_computer_science", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="college_computer_science", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11269,7 +11270,7 @@ mmlu_college_computer_science_helm = LightevalTaskConfig( name="mmlu:college_computer_science", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="college_computer_science", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11287,7 +11288,7 @@ mmlu_college_mathematics_original = LightevalTaskConfig( name="mmlu:college_mathematics", suite=["original", "mmlu"], - prompt_function="mmlu_college_mathematics", + prompt_function=prompt.mmlu_college_mathematics, hf_repo="cais/mmlu", hf_subset="college_mathematics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11305,7 +11306,7 @@ mmlu_college_mathematics_leaderboard = LightevalTaskConfig( name="mmlu:college_mathematics", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="college_mathematics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11323,7 +11324,7 @@ mmlu_college_mathematics_helm = LightevalTaskConfig( name="mmlu:college_mathematics", suite=["helm", "helm_general"], - 
prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="college_mathematics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11341,7 +11342,7 @@ mmlu_college_medicine_original = LightevalTaskConfig( name="mmlu:college_medicine", suite=["original", "mmlu"], - prompt_function="mmlu_college_medicine", + prompt_function=prompt.mmlu_college_medicine, hf_repo="cais/mmlu", hf_subset="college_medicine", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11359,7 +11360,7 @@ mmlu_college_medicine_leaderboard = LightevalTaskConfig( name="mmlu:college_medicine", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="college_medicine", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11377,7 +11378,7 @@ mmlu_college_medicine_helm = LightevalTaskConfig( name="mmlu:college_medicine", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="college_medicine", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11395,7 +11396,7 @@ mmlu_college_physics_original = LightevalTaskConfig( name="mmlu:college_physics", suite=["original", "mmlu"], - prompt_function="mmlu_college_physics", + prompt_function=prompt.mmlu_college_physics, hf_repo="cais/mmlu", hf_subset="college_physics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11413,7 +11414,7 @@ mmlu_college_physics_leaderboard = LightevalTaskConfig( name="mmlu:college_physics", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="college_physics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11431,7 +11432,7 @@ mmlu_college_physics_helm = LightevalTaskConfig( name="mmlu:college_physics", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="college_physics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11449,7 +11450,7 @@ mmlu_computer_security_original = LightevalTaskConfig( name="mmlu:computer_security", suite=["original", "mmlu"], - prompt_function="mmlu_computer_security", + prompt_function=prompt.mmlu_computer_security, hf_repo="cais/mmlu", hf_subset="computer_security", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11467,7 +11468,7 @@ mmlu_computer_security_leaderboard = LightevalTaskConfig( name="mmlu:computer_security", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="computer_security", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11485,7 +11486,7 @@ mmlu_computer_security_helm = LightevalTaskConfig( name="mmlu:computer_security", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="computer_security", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11503,7 +11504,7 @@ mmlu_conceptual_physics_original = LightevalTaskConfig( name="mmlu:conceptual_physics", suite=["original", "mmlu"], - prompt_function="mmlu_conceptual_physics", + prompt_function=prompt.mmlu_conceptual_physics, hf_repo="cais/mmlu", hf_subset="conceptual_physics", hf_avail_splits=["auxiliary_train", "test", "validation", 
"dev"], @@ -11521,7 +11522,7 @@ mmlu_conceptual_physics_leaderboard = LightevalTaskConfig( name="mmlu:conceptual_physics", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="conceptual_physics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11539,7 +11540,7 @@ mmlu_conceptual_physics_helm = LightevalTaskConfig( name="mmlu:conceptual_physics", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="conceptual_physics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11557,7 +11558,7 @@ mmlu_econometrics_original = LightevalTaskConfig( name="mmlu:econometrics", suite=["original", "mmlu"], - prompt_function="mmlu_econometrics", + prompt_function=prompt.mmlu_econometrics, hf_repo="cais/mmlu", hf_subset="econometrics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11575,7 +11576,7 @@ mmlu_econometrics_leaderboard = LightevalTaskConfig( name="mmlu:econometrics", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="econometrics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11593,7 +11594,7 @@ mmlu_econometrics_helm = LightevalTaskConfig( name="mmlu:econometrics", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="econometrics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11611,7 +11612,7 @@ mmlu_electrical_engineering_original = LightevalTaskConfig( name="mmlu:electrical_engineering", suite=["original", "mmlu"], - prompt_function="mmlu_electrical_engineering", + prompt_function=prompt.mmlu_electrical_engineering, hf_repo="cais/mmlu", hf_subset="electrical_engineering", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11629,7 +11630,7 @@ mmlu_electrical_engineering_leaderboard = LightevalTaskConfig( name="mmlu:electrical_engineering", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="electrical_engineering", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11647,7 +11648,7 @@ mmlu_electrical_engineering_helm = LightevalTaskConfig( name="mmlu:electrical_engineering", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="electrical_engineering", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11665,7 +11666,7 @@ mmlu_elementary_mathematics_original = LightevalTaskConfig( name="mmlu:elementary_mathematics", suite=["original", "mmlu"], - prompt_function="mmlu_elementary_mathematics", + prompt_function=prompt.mmlu_elementary_mathematics, hf_repo="cais/mmlu", hf_subset="elementary_mathematics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11683,7 +11684,7 @@ mmlu_elementary_mathematics_leaderboard = LightevalTaskConfig( name="mmlu:elementary_mathematics", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="elementary_mathematics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11701,7 +11702,7 @@ mmlu_elementary_mathematics_helm = LightevalTaskConfig( 
name="mmlu:elementary_mathematics", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="elementary_mathematics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11719,7 +11720,7 @@ mmlu_formal_logic_original = LightevalTaskConfig( name="mmlu:formal_logic", suite=["original", "mmlu"], - prompt_function="mmlu_formal_logic", + prompt_function=prompt.mmlu_formal_logic, hf_repo="cais/mmlu", hf_subset="formal_logic", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11737,7 +11738,7 @@ mmlu_formal_logic_leaderboard = LightevalTaskConfig( name="mmlu:formal_logic", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="formal_logic", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11755,7 +11756,7 @@ mmlu_formal_logic_helm = LightevalTaskConfig( name="mmlu:formal_logic", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="formal_logic", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11773,7 +11774,7 @@ mmlu_global_facts_original = LightevalTaskConfig( name="mmlu:global_facts", suite=["original", "mmlu"], - prompt_function="mmlu_global_facts", + prompt_function=prompt.mmlu_global_facts, hf_repo="cais/mmlu", hf_subset="global_facts", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11791,7 +11792,7 @@ mmlu_global_facts_leaderboard = LightevalTaskConfig( name="mmlu:global_facts", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="global_facts", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11809,7 +11810,7 @@ mmlu_global_facts_helm = LightevalTaskConfig( name="mmlu:global_facts", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="global_facts", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11827,7 +11828,7 @@ mmlu_high_school_biology_original = LightevalTaskConfig( name="mmlu:high_school_biology", suite=["original", "mmlu"], - prompt_function="mmlu_high_school_biology", + prompt_function=prompt.mmlu_high_school_biology, hf_repo="cais/mmlu", hf_subset="high_school_biology", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11845,7 +11846,7 @@ mmlu_high_school_biology_leaderboard = LightevalTaskConfig( name="mmlu:high_school_biology", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="high_school_biology", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11863,7 +11864,7 @@ mmlu_high_school_biology_helm = LightevalTaskConfig( name="mmlu:high_school_biology", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="high_school_biology", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11881,7 +11882,7 @@ mmlu_high_school_chemistry_original = LightevalTaskConfig( name="mmlu:high_school_chemistry", suite=["original", "mmlu"], - prompt_function="mmlu_high_school_chemistry", + prompt_function=prompt.mmlu_high_school_chemistry, hf_repo="cais/mmlu", hf_subset="high_school_chemistry", 
hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11899,7 +11900,7 @@ mmlu_high_school_chemistry_leaderboard = LightevalTaskConfig( name="mmlu:high_school_chemistry", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="high_school_chemistry", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11917,7 +11918,7 @@ mmlu_high_school_chemistry_helm = LightevalTaskConfig( name="mmlu:high_school_chemistry", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="high_school_chemistry", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11935,7 +11936,7 @@ mmlu_high_school_computer_science_original = LightevalTaskConfig( name="mmlu:high_school_computer_science", suite=["original", "mmlu"], - prompt_function="mmlu_high_school_computer_science", + prompt_function=prompt.mmlu_high_school_computer_science, hf_repo="cais/mmlu", hf_subset="high_school_computer_science", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11953,7 +11954,7 @@ mmlu_high_school_computer_science_leaderboard = LightevalTaskConfig( name="mmlu:high_school_computer_science", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="high_school_computer_science", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11971,7 +11972,7 @@ mmlu_high_school_computer_science_helm = LightevalTaskConfig( name="mmlu:high_school_computer_science", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="high_school_computer_science", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -11989,7 +11990,7 @@ mmlu_high_school_european_history_original = LightevalTaskConfig( name="mmlu:high_school_european_history", suite=["original", "mmlu"], - prompt_function="mmlu_high_school_european_history", + prompt_function=prompt.mmlu_high_school_european_history, hf_repo="cais/mmlu", hf_subset="high_school_european_history", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12007,7 +12008,7 @@ mmlu_high_school_european_history_leaderboard = LightevalTaskConfig( name="mmlu:high_school_european_history", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="high_school_european_history", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12025,7 +12026,7 @@ mmlu_high_school_european_history_helm = LightevalTaskConfig( name="mmlu:high_school_european_history", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="high_school_european_history", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12043,7 +12044,7 @@ mmlu_high_school_geography_original = LightevalTaskConfig( name="mmlu:high_school_geography", suite=["original", "mmlu"], - prompt_function="mmlu_high_school_geography", + prompt_function=prompt.mmlu_high_school_geography, hf_repo="cais/mmlu", hf_subset="high_school_geography", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12061,7 +12062,7 @@ mmlu_high_school_geography_leaderboard = LightevalTaskConfig( name="mmlu:high_school_geography", suite=["leaderboard", "mmlu"], 
- prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="high_school_geography", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12079,7 +12080,7 @@ mmlu_high_school_geography_helm = LightevalTaskConfig( name="mmlu:high_school_geography", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="high_school_geography", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12097,7 +12098,7 @@ mmlu_high_school_government_and_politics_original = LightevalTaskConfig( name="mmlu:high_school_government_and_politics", suite=["original", "mmlu"], - prompt_function="mmlu_high_school_government_and_politics", + prompt_function=prompt.mmlu_high_school_government_and_politics, hf_repo="cais/mmlu", hf_subset="high_school_government_and_politics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12115,7 +12116,7 @@ mmlu_high_school_government_and_politics_leaderboard = LightevalTaskConfig( name="mmlu:high_school_government_and_politics", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="high_school_government_and_politics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12133,7 +12134,7 @@ mmlu_high_school_government_and_politics_helm = LightevalTaskConfig( name="mmlu:high_school_government_and_politics", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="high_school_government_and_politics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12151,7 +12152,7 @@ mmlu_high_school_macroeconomics_original = LightevalTaskConfig( name="mmlu:high_school_macroeconomics", suite=["original", "mmlu"], - prompt_function="mmlu_high_school_macroeconomics", + prompt_function=prompt.mmlu_high_school_macroeconomics, hf_repo="cais/mmlu", hf_subset="high_school_macroeconomics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12169,7 +12170,7 @@ mmlu_high_school_macroeconomics_leaderboard = LightevalTaskConfig( name="mmlu:high_school_macroeconomics", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="high_school_macroeconomics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12187,7 +12188,7 @@ mmlu_high_school_macroeconomics_helm = LightevalTaskConfig( name="mmlu:high_school_macroeconomics", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="high_school_macroeconomics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12205,7 +12206,7 @@ mmlu_high_school_mathematics_original = LightevalTaskConfig( name="mmlu:high_school_mathematics", suite=["original", "mmlu"], - prompt_function="mmlu_high_school_mathematics", + prompt_function=prompt.mmlu_high_school_mathematics, hf_repo="cais/mmlu", hf_subset="high_school_mathematics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12223,7 +12224,7 @@ mmlu_high_school_mathematics_leaderboard = LightevalTaskConfig( name="mmlu:high_school_mathematics", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="high_school_mathematics", 
hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12241,7 +12242,7 @@ mmlu_high_school_mathematics_helm = LightevalTaskConfig( name="mmlu:high_school_mathematics", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="high_school_mathematics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12259,7 +12260,7 @@ mmlu_high_school_microeconomics_original = LightevalTaskConfig( name="mmlu:high_school_microeconomics", suite=["original", "mmlu"], - prompt_function="mmlu_high_school_microeconomics", + prompt_function=prompt.mmlu_high_school_microeconomics, hf_repo="cais/mmlu", hf_subset="high_school_microeconomics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12277,7 +12278,7 @@ mmlu_high_school_microeconomics_leaderboard = LightevalTaskConfig( name="mmlu:high_school_microeconomics", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="high_school_microeconomics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12295,7 +12296,7 @@ mmlu_high_school_microeconomics_helm = LightevalTaskConfig( name="mmlu:high_school_microeconomics", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="high_school_microeconomics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12313,7 +12314,7 @@ mmlu_high_school_physics_original = LightevalTaskConfig( name="mmlu:high_school_physics", suite=["original", "mmlu"], - prompt_function="mmlu_high_school_physics", + prompt_function=prompt.mmlu_high_school_physics, hf_repo="cais/mmlu", hf_subset="high_school_physics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12331,7 +12332,7 @@ mmlu_high_school_physics_leaderboard = LightevalTaskConfig( name="mmlu:high_school_physics", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="high_school_physics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12349,7 +12350,7 @@ mmlu_high_school_physics_helm = LightevalTaskConfig( name="mmlu:high_school_physics", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="high_school_physics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12367,7 +12368,7 @@ mmlu_high_school_psychology_original = LightevalTaskConfig( name="mmlu:high_school_psychology", suite=["original", "mmlu"], - prompt_function="mmlu_high_school_psychology", + prompt_function=prompt.mmlu_high_school_psychology, hf_repo="cais/mmlu", hf_subset="high_school_psychology", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12385,7 +12386,7 @@ mmlu_high_school_psychology_leaderboard = LightevalTaskConfig( name="mmlu:high_school_psychology", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="high_school_psychology", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12403,7 +12404,7 @@ mmlu_high_school_psychology_helm = LightevalTaskConfig( name="mmlu:high_school_psychology", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", 
hf_subset="high_school_psychology", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12421,7 +12422,7 @@ mmlu_high_school_statistics_original = LightevalTaskConfig( name="mmlu:high_school_statistics", suite=["original", "mmlu"], - prompt_function="mmlu_high_school_statistics", + prompt_function=prompt.mmlu_high_school_statistics, hf_repo="cais/mmlu", hf_subset="high_school_statistics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12439,7 +12440,7 @@ mmlu_high_school_statistics_leaderboard = LightevalTaskConfig( name="mmlu:high_school_statistics", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="high_school_statistics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12457,7 +12458,7 @@ mmlu_high_school_statistics_helm = LightevalTaskConfig( name="mmlu:high_school_statistics", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="high_school_statistics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12475,7 +12476,7 @@ mmlu_high_school_us_history_original = LightevalTaskConfig( name="mmlu:high_school_us_history", suite=["original", "mmlu"], - prompt_function="mmlu_high_school_us_history", + prompt_function=prompt.mmlu_high_school_us_history, hf_repo="cais/mmlu", hf_subset="high_school_us_history", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12493,7 +12494,7 @@ mmlu_high_school_us_history_leaderboard = LightevalTaskConfig( name="mmlu:high_school_us_history", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="high_school_us_history", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12511,7 +12512,7 @@ mmlu_high_school_us_history_helm = LightevalTaskConfig( name="mmlu:high_school_us_history", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="high_school_us_history", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12529,7 +12530,7 @@ mmlu_high_school_world_history_original = LightevalTaskConfig( name="mmlu:high_school_world_history", suite=["original", "mmlu"], - prompt_function="mmlu_high_school_world_history", + prompt_function=prompt.mmlu_high_school_world_history, hf_repo="cais/mmlu", hf_subset="high_school_world_history", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12547,7 +12548,7 @@ mmlu_high_school_world_history_leaderboard = LightevalTaskConfig( name="mmlu:high_school_world_history", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="high_school_world_history", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12565,7 +12566,7 @@ mmlu_high_school_world_history_helm = LightevalTaskConfig( name="mmlu:high_school_world_history", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="high_school_world_history", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12583,7 +12584,7 @@ mmlu_human_aging_original = LightevalTaskConfig( name="mmlu:human_aging", suite=["original", "mmlu"], - prompt_function="mmlu_human_aging", + 
prompt_function=prompt.mmlu_human_aging, hf_repo="cais/mmlu", hf_subset="human_aging", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12601,7 +12602,7 @@ mmlu_human_aging_leaderboard = LightevalTaskConfig( name="mmlu:human_aging", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="human_aging", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12619,7 +12620,7 @@ mmlu_human_aging_helm = LightevalTaskConfig( name="mmlu:human_aging", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="human_aging", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12637,7 +12638,7 @@ mmlu_human_sexuality_original = LightevalTaskConfig( name="mmlu:human_sexuality", suite=["original", "mmlu"], - prompt_function="mmlu_human_sexuality", + prompt_function=prompt.mmlu_human_sexuality, hf_repo="cais/mmlu", hf_subset="human_sexuality", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12655,7 +12656,7 @@ mmlu_human_sexuality_leaderboard = LightevalTaskConfig( name="mmlu:human_sexuality", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="human_sexuality", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12673,7 +12674,7 @@ mmlu_human_sexuality_helm = LightevalTaskConfig( name="mmlu:human_sexuality", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="human_sexuality", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12691,7 +12692,7 @@ mmlu_international_law_original = LightevalTaskConfig( name="mmlu:international_law", suite=["original", "mmlu"], - prompt_function="mmlu_international_law", + prompt_function=prompt.mmlu_international_law, hf_repo="cais/mmlu", hf_subset="international_law", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12709,7 +12710,7 @@ mmlu_international_law_leaderboard = LightevalTaskConfig( name="mmlu:international_law", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="international_law", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12727,7 +12728,7 @@ mmlu_international_law_helm = LightevalTaskConfig( name="mmlu:international_law", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="international_law", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12745,7 +12746,7 @@ mmlu_jurisprudence_original = LightevalTaskConfig( name="mmlu:jurisprudence", suite=["original", "mmlu"], - prompt_function="mmlu_jurisprudence", + prompt_function=prompt.mmlu_jurisprudence, hf_repo="cais/mmlu", hf_subset="jurisprudence", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12763,7 +12764,7 @@ mmlu_jurisprudence_leaderboard = LightevalTaskConfig( name="mmlu:jurisprudence", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="jurisprudence", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12781,7 +12782,7 @@ mmlu_jurisprudence_helm = LightevalTaskConfig( name="mmlu:jurisprudence", 
suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="jurisprudence", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12799,7 +12800,7 @@ mmlu_logical_fallacies_original = LightevalTaskConfig( name="mmlu:logical_fallacies", suite=["original", "mmlu"], - prompt_function="mmlu_logical_fallacies", + prompt_function=prompt.mmlu_logical_fallacies, hf_repo="cais/mmlu", hf_subset="logical_fallacies", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12817,7 +12818,7 @@ mmlu_logical_fallacies_leaderboard = LightevalTaskConfig( name="mmlu:logical_fallacies", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="logical_fallacies", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12835,7 +12836,7 @@ mmlu_logical_fallacies_helm = LightevalTaskConfig( name="mmlu:logical_fallacies", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="logical_fallacies", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12853,7 +12854,7 @@ mmlu_machine_learning_original = LightevalTaskConfig( name="mmlu:machine_learning", suite=["original", "mmlu"], - prompt_function="mmlu_machine_learning", + prompt_function=prompt.mmlu_machine_learning, hf_repo="cais/mmlu", hf_subset="machine_learning", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12871,7 +12872,7 @@ mmlu_machine_learning_leaderboard = LightevalTaskConfig( name="mmlu:machine_learning", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="machine_learning", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12889,7 +12890,7 @@ mmlu_machine_learning_helm = LightevalTaskConfig( name="mmlu:machine_learning", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="machine_learning", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12907,7 +12908,7 @@ mmlu_management_original = LightevalTaskConfig( name="mmlu:management", suite=["original", "mmlu"], - prompt_function="mmlu_management", + prompt_function=prompt.mmlu_management, hf_repo="cais/mmlu", hf_subset="management", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12925,7 +12926,7 @@ mmlu_management_leaderboard = LightevalTaskConfig( name="mmlu:management", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="management", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12943,7 +12944,7 @@ mmlu_management_helm = LightevalTaskConfig( name="mmlu:management", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="management", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12961,7 +12962,7 @@ mmlu_marketing_original = LightevalTaskConfig( name="mmlu:marketing", suite=["original", "mmlu"], - prompt_function="mmlu_marketing", + prompt_function=prompt.mmlu_marketing, hf_repo="cais/mmlu", hf_subset="marketing", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12979,7 +12980,7 @@ mmlu_marketing_leaderboard = 
LightevalTaskConfig( name="mmlu:marketing", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="marketing", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -12997,7 +12998,7 @@ mmlu_marketing_helm = LightevalTaskConfig( name="mmlu:marketing", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="marketing", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13015,7 +13016,7 @@ mmlu_medical_genetics_original = LightevalTaskConfig( name="mmlu:medical_genetics", suite=["original", "mmlu"], - prompt_function="mmlu_medical_genetics", + prompt_function=prompt.mmlu_medical_genetics, hf_repo="cais/mmlu", hf_subset="medical_genetics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13033,7 +13034,7 @@ mmlu_medical_genetics_leaderboard = LightevalTaskConfig( name="mmlu:medical_genetics", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="medical_genetics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13051,7 +13052,7 @@ mmlu_medical_genetics_helm = LightevalTaskConfig( name="mmlu:medical_genetics", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="medical_genetics", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13069,7 +13070,7 @@ mmlu_miscellaneous_original = LightevalTaskConfig( name="mmlu:miscellaneous", suite=["original", "mmlu"], - prompt_function="mmlu_miscellaneous", + prompt_function=prompt.mmlu_miscellaneous, hf_repo="cais/mmlu", hf_subset="miscellaneous", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13087,7 +13088,7 @@ mmlu_miscellaneous_leaderboard = LightevalTaskConfig( name="mmlu:miscellaneous", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="miscellaneous", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13105,7 +13106,7 @@ mmlu_miscellaneous_helm = LightevalTaskConfig( name="mmlu:miscellaneous", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="miscellaneous", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13123,7 +13124,7 @@ mmlu_moral_disputes_original = LightevalTaskConfig( name="mmlu:moral_disputes", suite=["original", "mmlu"], - prompt_function="mmlu_moral_disputes", + prompt_function=prompt.mmlu_moral_disputes, hf_repo="cais/mmlu", hf_subset="moral_disputes", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13141,7 +13142,7 @@ mmlu_moral_disputes_leaderboard = LightevalTaskConfig( name="mmlu:moral_disputes", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="moral_disputes", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13159,7 +13160,7 @@ mmlu_moral_disputes_helm = LightevalTaskConfig( name="mmlu:moral_disputes", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="moral_disputes", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13177,7 +13178,7 @@ 
mmlu_moral_scenarios_original = LightevalTaskConfig( name="mmlu:moral_scenarios", suite=["original", "mmlu"], - prompt_function="mmlu_moral_scenarios", + prompt_function=prompt.mmlu_moral_scenarios, hf_repo="cais/mmlu", hf_subset="moral_scenarios", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13195,7 +13196,7 @@ mmlu_moral_scenarios_leaderboard = LightevalTaskConfig( name="mmlu:moral_scenarios", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="moral_scenarios", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13213,7 +13214,7 @@ mmlu_moral_scenarios_helm = LightevalTaskConfig( name="mmlu:moral_scenarios", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="moral_scenarios", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13231,7 +13232,7 @@ mmlu_nutrition_original = LightevalTaskConfig( name="mmlu:nutrition", suite=["original", "mmlu"], - prompt_function="mmlu_nutrition", + prompt_function=prompt.mmlu_nutrition, hf_repo="cais/mmlu", hf_subset="nutrition", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13249,7 +13250,7 @@ mmlu_nutrition_leaderboard = LightevalTaskConfig( name="mmlu:nutrition", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="nutrition", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13267,7 +13268,7 @@ mmlu_nutrition_helm = LightevalTaskConfig( name="mmlu:nutrition", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="nutrition", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13285,7 +13286,7 @@ mmlu_philosophy_original = LightevalTaskConfig( name="mmlu:philosophy", suite=["original", "mmlu"], - prompt_function="mmlu_philosophy", + prompt_function=prompt.mmlu_philosophy, hf_repo="cais/mmlu", hf_subset="philosophy", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13303,7 +13304,7 @@ mmlu_philosophy_leaderboard = LightevalTaskConfig( name="mmlu:philosophy", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="philosophy", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13321,7 +13322,7 @@ mmlu_philosophy_helm = LightevalTaskConfig( name="mmlu:philosophy", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="philosophy", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13339,7 +13340,7 @@ mmlu_prehistory_original = LightevalTaskConfig( name="mmlu:prehistory", suite=["original", "mmlu"], - prompt_function="mmlu_prehistory", + prompt_function=prompt.mmlu_prehistory, hf_repo="cais/mmlu", hf_subset="prehistory", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13357,7 +13358,7 @@ mmlu_prehistory_leaderboard = LightevalTaskConfig( name="mmlu:prehistory", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="prehistory", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13375,7 +13376,7 @@ mmlu_prehistory_helm = LightevalTaskConfig( 
name="mmlu:prehistory", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="prehistory", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13393,7 +13394,7 @@ mmlu_professional_accounting_original = LightevalTaskConfig( name="mmlu:professional_accounting", suite=["original", "mmlu"], - prompt_function="mmlu_professional_accounting", + prompt_function=prompt.mmlu_professional_accounting, hf_repo="cais/mmlu", hf_subset="professional_accounting", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13411,7 +13412,7 @@ mmlu_professional_accounting_leaderboard = LightevalTaskConfig( name="mmlu:professional_accounting", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="professional_accounting", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13429,7 +13430,7 @@ mmlu_professional_accounting_helm = LightevalTaskConfig( name="mmlu:professional_accounting", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="professional_accounting", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13447,7 +13448,7 @@ mmlu_professional_law_original = LightevalTaskConfig( name="mmlu:professional_law", suite=["original", "mmlu"], - prompt_function="mmlu_professional_law", + prompt_function=prompt.mmlu_professional_law, hf_repo="cais/mmlu", hf_subset="professional_law", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13465,7 +13466,7 @@ mmlu_professional_law_leaderboard = LightevalTaskConfig( name="mmlu:professional_law", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="professional_law", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13483,7 +13484,7 @@ mmlu_professional_law_helm = LightevalTaskConfig( name="mmlu:professional_law", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="professional_law", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13501,7 +13502,7 @@ mmlu_professional_medicine_original = LightevalTaskConfig( name="mmlu:professional_medicine", suite=["original", "mmlu"], - prompt_function="mmlu_professional_medicine", + prompt_function=prompt.mmlu_professional_medicine, hf_repo="cais/mmlu", hf_subset="professional_medicine", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13519,7 +13520,7 @@ mmlu_professional_medicine_leaderboard = LightevalTaskConfig( name="mmlu:professional_medicine", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="professional_medicine", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13537,7 +13538,7 @@ mmlu_professional_medicine_helm = LightevalTaskConfig( name="mmlu:professional_medicine", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="professional_medicine", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13555,7 +13556,7 @@ mmlu_professional_psychology_original = LightevalTaskConfig( name="mmlu:professional_psychology", suite=["original", "mmlu"], - 
prompt_function="mmlu_professional_psychology", + prompt_function=prompt.mmlu_professional_psychology, hf_repo="cais/mmlu", hf_subset="professional_psychology", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13573,7 +13574,7 @@ mmlu_professional_psychology_leaderboard = LightevalTaskConfig( name="mmlu:professional_psychology", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="professional_psychology", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13591,7 +13592,7 @@ mmlu_professional_psychology_helm = LightevalTaskConfig( name="mmlu:professional_psychology", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="professional_psychology", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13609,7 +13610,7 @@ mmlu_public_relations_original = LightevalTaskConfig( name="mmlu:public_relations", suite=["original", "mmlu"], - prompt_function="mmlu_public_relations", + prompt_function=prompt.mmlu_public_relations, hf_repo="cais/mmlu", hf_subset="public_relations", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13627,7 +13628,7 @@ mmlu_public_relations_leaderboard = LightevalTaskConfig( name="mmlu:public_relations", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="public_relations", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13645,7 +13646,7 @@ mmlu_public_relations_helm = LightevalTaskConfig( name="mmlu:public_relations", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="public_relations", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13663,7 +13664,7 @@ mmlu_security_studies_original = LightevalTaskConfig( name="mmlu:security_studies", suite=["original", "mmlu"], - prompt_function="mmlu_security_studies", + prompt_function=prompt.mmlu_security_studies, hf_repo="cais/mmlu", hf_subset="security_studies", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13681,7 +13682,7 @@ mmlu_security_studies_leaderboard = LightevalTaskConfig( name="mmlu:security_studies", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="security_studies", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13699,7 +13700,7 @@ mmlu_security_studies_helm = LightevalTaskConfig( name="mmlu:security_studies", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="security_studies", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13717,7 +13718,7 @@ mmlu_sociology_original = LightevalTaskConfig( name="mmlu:sociology", suite=["original", "mmlu"], - prompt_function="mmlu_sociology", + prompt_function=prompt.mmlu_sociology, hf_repo="cais/mmlu", hf_subset="sociology", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13735,7 +13736,7 @@ mmlu_sociology_leaderboard = LightevalTaskConfig( name="mmlu:sociology", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="sociology", hf_avail_splits=["auxiliary_train", "test", 
"validation", "dev"], @@ -13753,7 +13754,7 @@ mmlu_sociology_helm = LightevalTaskConfig( name="mmlu:sociology", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="sociology", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13771,7 +13772,7 @@ mmlu_us_foreign_policy_original = LightevalTaskConfig( name="mmlu:us_foreign_policy", suite=["original", "mmlu"], - prompt_function="mmlu_us_foreign_policy", + prompt_function=prompt.mmlu_us_foreign_policy, hf_repo="cais/mmlu", hf_subset="us_foreign_policy", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13789,7 +13790,7 @@ mmlu_us_foreign_policy_leaderboard = LightevalTaskConfig( name="mmlu:us_foreign_policy", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="us_foreign_policy", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13807,7 +13808,7 @@ mmlu_us_foreign_policy_helm = LightevalTaskConfig( name="mmlu:us_foreign_policy", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="us_foreign_policy", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13825,7 +13826,7 @@ mmlu_virology_original = LightevalTaskConfig( name="mmlu:virology", suite=["original", "mmlu"], - prompt_function="mmlu_virology", + prompt_function=prompt.mmlu_virology, hf_repo="cais/mmlu", hf_subset="virology", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13843,7 +13844,7 @@ mmlu_virology_leaderboard = LightevalTaskConfig( name="mmlu:virology", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="virology", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13861,7 +13862,7 @@ mmlu_virology_helm = LightevalTaskConfig( name="mmlu:virology", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="virology", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13879,7 +13880,7 @@ mmlu_world_religions_original = LightevalTaskConfig( name="mmlu:world_religions", suite=["original", "mmlu"], - prompt_function="mmlu_world_religions", + prompt_function=prompt.mmlu_world_religions, hf_repo="cais/mmlu", hf_subset="world_religions", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13897,7 +13898,7 @@ mmlu_world_religions_leaderboard = LightevalTaskConfig( name="mmlu:world_religions", suite=["leaderboard", "mmlu"], - prompt_function="mmlu_harness", + prompt_function=prompt.mmlu_harness, hf_repo="lighteval/mmlu", hf_subset="world_religions", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13915,7 +13916,7 @@ mmlu_world_religions_helm = LightevalTaskConfig( name="mmlu:world_religions", suite=["helm", "helm_general"], - prompt_function="mmlu_helm", + prompt_function=prompt.mmlu_helm, hf_repo="lighteval/mmlu", hf_subset="world_religions", hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], @@ -13933,7 +13934,7 @@ mnist_ascii_bigbench = LightevalTaskConfig( name="mnist_ascii", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="mnist_ascii", hf_avail_splits=["default", "train", "validation"], @@ -13951,7 
+13952,7 @@ modified_arithmetic_bigbench = LightevalTaskConfig( name="modified_arithmetic", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="modified_arithmetic", hf_avail_splits=["default", "train", "validation"], @@ -13969,7 +13970,7 @@ moral_permissibility_bigbench = LightevalTaskConfig( name="moral_permissibility", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="moral_permissibility", hf_avail_splits=["default", "train", "validation"], @@ -13987,7 +13988,7 @@ movie_dialog_same_or_different_bigbench = LightevalTaskConfig( name="movie_dialog_same_or_different", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="movie_dialog_same_or_different", hf_avail_splits=["default", "train", "validation"], @@ -14005,7 +14006,7 @@ movie_recommendation_bigbench = LightevalTaskConfig( name="movie_recommendation", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="movie_recommendation", hf_avail_splits=["default", "train", "validation"], @@ -14023,7 +14024,7 @@ mtnt2019_en_fr_lighteval = LightevalTaskConfig( name="mtnt2019:en-fr", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="mtnt2019_en-fr", hf_avail_splits=["test"], @@ -14041,7 +14042,7 @@ mtnt2019_en_ja_lighteval = LightevalTaskConfig( name="mtnt2019:en-ja", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="mtnt2019_en-ja", hf_avail_splits=["test"], @@ -14059,7 +14060,7 @@ mtnt2019_fr_en_lighteval = LightevalTaskConfig( name="mtnt2019:fr-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="mtnt2019_fr-en", hf_avail_splits=["test"], @@ -14077,7 +14078,7 @@ mtnt2019_ja_en_lighteval = LightevalTaskConfig( name="mtnt2019:ja-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="mtnt2019_ja-en", hf_avail_splits=["test"], @@ -14095,7 +14096,7 @@ mult_data_wrangling_bigbench = LightevalTaskConfig( name="mult_data_wrangling", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="mult_data_wrangling", hf_avail_splits=["default", "train", "validation"], @@ -14113,7 +14114,7 @@ multiemo_bigbench = LightevalTaskConfig( name="multiemo", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="multiemo", hf_avail_splits=["default", "train", "validation"], @@ -14131,7 +14132,7 @@ mutual_lighteval = LightevalTaskConfig( name="mutual", suite=["lighteval"], - prompt_function="mutual", + prompt_function=prompt.mutual, hf_repo="lighteval/mutual_harness", hf_subset="mutual", hf_avail_splits=["train", "validation"], @@ -14149,7 +14150,7 @@ mutual_plus_lighteval = LightevalTaskConfig( name="mutual_plus", suite=["lighteval"], - prompt_function="mutual", + prompt_function=prompt.mutual, 
hf_repo="lighteval/mutual_harness", hf_subset="mutual_plus", hf_avail_splits=["train", "validation"], @@ -14167,7 +14168,7 @@ narrativeqa_helm = LightevalTaskConfig( name="narrativeqa", suite=["helm", "helm_general"], - prompt_function="narrativeqa", + prompt_function=prompt.narrativeqa, hf_repo="lighteval/narrative_qa_helm", hf_subset="default", hf_avail_splits=["train", "test", "validation"], @@ -14185,7 +14186,7 @@ natural_instructions_bigbench = LightevalTaskConfig( name="natural_instructions", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="natural_instructions", hf_avail_splits=["default", "train", "validation"], @@ -14203,7 +14204,7 @@ navigate_bigbench = LightevalTaskConfig( name="navigate", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="navigate", hf_avail_splits=["default", "train", "validation"], @@ -14221,7 +14222,7 @@ nonsense_words_grammar_bigbench = LightevalTaskConfig( name="nonsense_words_grammar", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="nonsense_words_grammar", hf_avail_splits=["default", "train", "validation"], @@ -14239,7 +14240,7 @@ novel_concepts_bigbench_lite = LightevalTaskConfig( name="novel_concepts", suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="novel_concepts", hf_avail_splits=["default", "train", "validation"], @@ -14257,7 +14258,7 @@ numeracy_linear_example_helm = LightevalTaskConfig( name="numeracy:linear_example", suite=["helm"], - prompt_function="numeracy", + prompt_function=prompt.numeracy, hf_repo="lighteval/numeracy", hf_subset="linear_example", hf_avail_splits=["train", "test"], @@ -14275,7 +14276,7 @@ numeracy_linear_standard_helm = LightevalTaskConfig( name="numeracy:linear_standard", suite=["helm"], - prompt_function="numeracy", + prompt_function=prompt.numeracy, hf_repo="lighteval/numeracy", hf_subset="linear_standard", hf_avail_splits=["train", "test"], @@ -14293,7 +14294,7 @@ numeracy_parabola_example_helm = LightevalTaskConfig( name="numeracy:parabola_example", suite=["helm"], - prompt_function="numeracy", + prompt_function=prompt.numeracy, hf_repo="lighteval/numeracy", hf_subset="parabola_example", hf_avail_splits=["train", "test"], @@ -14311,7 +14312,7 @@ numeracy_parabola_standard_helm = LightevalTaskConfig( name="numeracy:parabola_standard", suite=["helm"], - prompt_function="numeracy", + prompt_function=prompt.numeracy, hf_repo="lighteval/numeracy", hf_subset="parabola_standard", hf_avail_splits=["train", "test"], @@ -14329,7 +14330,7 @@ numeracy_paraboloid_example_helm = LightevalTaskConfig( name="numeracy:paraboloid_example", suite=["helm"], - prompt_function="numeracy", + prompt_function=prompt.numeracy, hf_repo="lighteval/numeracy", hf_subset="paraboloid_example", hf_avail_splits=["train", "test"], @@ -14347,7 +14348,7 @@ numeracy_paraboloid_standard_helm = LightevalTaskConfig( name="numeracy:paraboloid_standard", suite=["helm"], - prompt_function="numeracy", + prompt_function=prompt.numeracy, hf_repo="lighteval/numeracy", hf_subset="paraboloid_standard", hf_avail_splits=["train", "test"], @@ -14365,7 +14366,7 @@ numeracy_plane_example_helm = LightevalTaskConfig( name="numeracy:plane_example", suite=["helm"], - prompt_function="numeracy", + prompt_function=prompt.numeracy, 
hf_repo="lighteval/numeracy", hf_subset="plane_example", hf_avail_splits=["train", "test"], @@ -14383,7 +14384,7 @@ numeracy_plane_standard_helm = LightevalTaskConfig( name="numeracy:plane_standard", suite=["helm"], - prompt_function="numeracy", + prompt_function=prompt.numeracy, hf_repo="lighteval/numeracy", hf_subset="plane_standard", hf_avail_splits=["train", "test"], @@ -14401,7 +14402,7 @@ object_counting_bigbench = LightevalTaskConfig( name="object_counting", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="object_counting", hf_avail_splits=["default", "train", "validation"], @@ -14419,7 +14420,7 @@ odd_one_out_bigbench = LightevalTaskConfig( name="odd_one_out", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="odd_one_out", hf_avail_splits=["default", "train", "validation"], @@ -14437,7 +14438,7 @@ openbookqa_helm = LightevalTaskConfig( name="openbookqa", suite=["helm", "commonsense_scenario", "helm_general"], - prompt_function="openbookqa_helm", + prompt_function=prompt.openbookqa_helm, hf_repo="openbookqa", hf_subset="main", hf_avail_splits=["train", "test", "validation"], @@ -14455,7 +14456,7 @@ openbookqa_lighteval = LightevalTaskConfig( name="openbookqa", suite=["lighteval"], - prompt_function="openbookqa", + prompt_function=prompt.openbookqa, hf_repo="openbookqa", hf_subset="main", hf_avail_splits=["train", "test", "validation"], @@ -14473,7 +14474,7 @@ operators_bigbench_lite = LightevalTaskConfig( name="operators", suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function="bigbench_whitespace_after_query", + prompt_function=prompt.bigbench_whitespace_after_query, hf_repo="bigbench", hf_subset="operators", hf_avail_splits=["default", "train", "validation"], @@ -14490,7 +14491,7 @@ paragraph_segmentation_bigbench = LightevalTaskConfig( name="paragraph_segmentation", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="paragraph_segmentation", hf_avail_splits=["default", "train", "validation"], @@ -14508,7 +14509,7 @@ parsinlu_qa_bigbench = LightevalTaskConfig( name="parsinlu_qa", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="parsinlu_qa", hf_avail_splits=["default", "train", "validation"], @@ -14526,7 +14527,7 @@ parsinlu_reading_comprehension_bigbench_lite = LightevalTaskConfig( name="parsinlu_reading_comprehension", suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function="bigbench_linefeed_before_whitespace_after_query", + prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, hf_repo="bigbench", hf_subset="parsinlu_reading_comprehension", hf_avail_splits=["default", "train", "validation"], @@ -14543,7 +14544,7 @@ penguins_in_a_table_bigbench = LightevalTaskConfig( name="penguins_in_a_table", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="penguins_in_a_table", hf_avail_splits=["default", "train", "validation"], @@ -14561,7 +14562,7 @@ periodic_elements_bigbench = LightevalTaskConfig( name="periodic_elements", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="periodic_elements", hf_avail_splits=["default", "train", "validation"], @@ 
-14579,7 +14580,7 @@ persian_idioms_bigbench = LightevalTaskConfig( name="persian_idioms", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="persian_idioms", hf_avail_splits=["default", "train", "validation"], @@ -14597,7 +14598,7 @@ phrase_relatedness_bigbench = LightevalTaskConfig( name="phrase_relatedness", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="phrase_relatedness", hf_avail_splits=["default", "train", "validation"], @@ -14615,7 +14616,7 @@ physical_intuition_bigbench = LightevalTaskConfig( name="physical_intuition", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="physical_intuition", hf_avail_splits=["default", "train", "validation"], @@ -14633,7 +14634,7 @@ physics_bigbench = LightevalTaskConfig( name="physics", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="physics", hf_avail_splits=["default", "train", "validation"], @@ -14651,7 +14652,7 @@ physics_questions_bigbench = LightevalTaskConfig( name="physics_questions", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="physics_questions", hf_avail_splits=["default", "train", "validation"], @@ -14669,7 +14670,7 @@ piqa_lighteval = LightevalTaskConfig( name="piqa", suite=["lighteval"], - prompt_function="piqa_harness", + prompt_function=prompt.piqa_harness, hf_repo="piqa", hf_subset="plain_text", hf_avail_splits=["train", "test", "validation"], @@ -14687,7 +14688,7 @@ piqa_helm = LightevalTaskConfig( name="piqa", suite=["helm", "commonsense_scenario"], - prompt_function="piqa_helm", + prompt_function=prompt.piqa_helm, hf_repo="piqa", hf_subset="plain_text", hf_avail_splits=["train", "test", "validation"], @@ -14705,7 +14706,7 @@ play_dialog_same_or_different_bigbench_lite = LightevalTaskConfig( name="play_dialog_same_or_different", suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function="bigbench_linefeed_before_whitespace_after_query", + prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, hf_repo="bigbench", hf_subset="play_dialog_same_or_different", hf_avail_splits=["default", "train", "validation"], @@ -14723,7 +14724,7 @@ polish_sequence_labeling_bigbench = LightevalTaskConfig( name="polish_sequence_labeling", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="polish_sequence_labeling", hf_avail_splits=["default", "train", "validation"], @@ -14741,7 +14742,7 @@ presuppositions_as_nli_bigbench = LightevalTaskConfig( name="presuppositions_as_nli", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="presuppositions_as_nli", hf_avail_splits=["default", "train", "validation"], @@ -14759,7 +14760,7 @@ prost_lighteval = LightevalTaskConfig( name="prost", suite=["lighteval"], - prompt_function="prost", + prompt_function=prompt.prost, hf_repo="corypaik/prost", hf_subset="default", hf_avail_splits=["test"], @@ -14777,7 +14778,7 @@ pubmedqa_lighteval = LightevalTaskConfig( name="pubmedqa", suite=["lighteval"], - prompt_function="pubmed_qa", + prompt_function=prompt.pubmed_qa, hf_repo="pubmed_qa", hf_subset="pqa_labeled", 
hf_avail_splits=["train"], @@ -14795,7 +14796,7 @@ pubmedqa_helm = LightevalTaskConfig( name="pubmedqa", suite=["helm"], - prompt_function="pubmed_qa_helm", + prompt_function=prompt.pubmed_qa_helm, hf_repo="pubmed_qa", hf_subset="pqa_labeled", hf_avail_splits=["train"], @@ -14813,7 +14814,7 @@ qa4mre_2011_lighteval = LightevalTaskConfig( name="qa4mre:2011", suite=["lighteval"], - prompt_function="qa4mre", + prompt_function=prompt.qa4mre, hf_repo="qa4mre", hf_subset="2011.main.EN", hf_avail_splits=["train"], @@ -14831,7 +14832,7 @@ qa4mre_2012_lighteval = LightevalTaskConfig( name="qa4mre:2012", suite=["lighteval"], - prompt_function="qa4mre", + prompt_function=prompt.qa4mre, hf_repo="qa4mre", hf_subset="2012.main.EN", hf_avail_splits=["train"], @@ -14849,7 +14850,7 @@ qa4mre_2013_lighteval = LightevalTaskConfig( name="qa4mre:2013", suite=["lighteval"], - prompt_function="qa4mre", + prompt_function=prompt.qa4mre, hf_repo="qa4mre", hf_subset="2013.main.EN", hf_avail_splits=["train"], @@ -14867,7 +14868,7 @@ qa_wikidata_bigbench = LightevalTaskConfig( name="qa_wikidata", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="qa_wikidata", hf_avail_splits=["default", "train", "validation"], @@ -14885,7 +14886,7 @@ qasper_lighteval = LightevalTaskConfig( name="qasper", suite=["lighteval"], - prompt_function="qasper", + prompt_function=prompt.qasper, hf_repo="qasper", hf_subset="qasper", hf_avail_splits=["train", "validation"], @@ -14903,7 +14904,7 @@ qasper_ll_lighteval = LightevalTaskConfig( name="qasper_ll", suite=["lighteval"], - prompt_function="qasper_ll", + prompt_function=prompt.qasper_ll, hf_repo="qasper", hf_subset="qasper", hf_avail_splits=["train", "validation"], @@ -14921,7 +14922,7 @@ quac_helm = LightevalTaskConfig( name="quac", suite=["helm"], - prompt_function="quac", + prompt_function=prompt.quac, hf_repo="lighteval/quac_helm", hf_subset="default", hf_avail_splits=["train", "validation"], @@ -14939,7 +14940,7 @@ question_selection_bigbench = LightevalTaskConfig( name="question_selection", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="question_selection", hf_avail_splits=["default", "train", "validation"], @@ -14957,7 +14958,7 @@ race_high_lighteval = LightevalTaskConfig( name="race:high", suite=["lighteval", "race"], - prompt_function="race", + prompt_function=prompt.race, hf_repo="EleutherAI/race", hf_subset="high", hf_avail_splits=["test"], @@ -14975,7 +14976,7 @@ raft_ade_corpus_v2_helm = LightevalTaskConfig( name="raft:ade_corpus_v2", suite=["helm", "helm_general"], - prompt_function="raft_ade_corpus_v2", + prompt_function=prompt.raft_ade_corpus_v2, hf_repo="ought/raft", hf_subset="ade_corpus_v2", hf_avail_splits=["train", "test"], @@ -15000,7 +15001,7 @@ raft_banking_77_helm = LightevalTaskConfig( name="raft:banking_77", suite=["helm", "helm_general"], - prompt_function="raft_banking_77", + prompt_function=prompt.raft_banking_77, hf_repo="ought/raft", hf_subset="banking_77", hf_avail_splits=["train", "test"], @@ -15025,7 +15026,7 @@ raft_neurips_impact_statement_risks_helm = LightevalTaskConfig( name="raft:neurips_impact_statement_risks", suite=["helm", "helm_general"], - prompt_function="raft_neurips_impact_statement_risks", + prompt_function=prompt.raft_neurips_impact_statement_risks, hf_repo="ought/raft", hf_subset="neurips_impact_statement_risks", hf_avail_splits=["train", "test"], @@ -15050,7 +15051,7 @@ 
raft_one_stop_english_helm = LightevalTaskConfig( name="raft:one_stop_english", suite=["helm", "helm_general"], - prompt_function="raft_one_stop_english", + prompt_function=prompt.raft_one_stop_english, hf_repo="ought/raft", hf_subset="one_stop_english", hf_avail_splits=["train", "test"], @@ -15075,7 +15076,7 @@ raft_overruling_helm = LightevalTaskConfig( name="raft:overruling", suite=["helm", "helm_general"], - prompt_function="raft_overruling", + prompt_function=prompt.raft_overruling, hf_repo="ought/raft", hf_subset="overruling", hf_avail_splits=["train", "test"], @@ -15100,7 +15101,7 @@ raft_semiconductor_org_types_helm = LightevalTaskConfig( name="raft:semiconductor_org_types", suite=["helm", "helm_general"], - prompt_function="raft_semiconductor_org_types", + prompt_function=prompt.raft_semiconductor_org_types, hf_repo="ought/raft", hf_subset="semiconductor_org_types", hf_avail_splits=["train", "test"], @@ -15125,7 +15126,7 @@ raft_systematic_review_inclusion_helm = LightevalTaskConfig( name="raft:systematic_review_inclusion", suite=["helm", "helm_general"], - prompt_function="raft_systematic_review_inclusion", + prompt_function=prompt.raft_systematic_review_inclusion, hf_repo="ought/raft", hf_subset="systematic_review_inclusion", hf_avail_splits=["train", "test"], @@ -15150,7 +15151,7 @@ raft_tai_safety_research_helm = LightevalTaskConfig( name="raft:tai_safety_research", suite=["helm", "helm_general"], - prompt_function="raft_tai_safety_research", + prompt_function=prompt.raft_tai_safety_research, hf_repo="ought/raft", hf_subset="tai_safety_research", hf_avail_splits=["train", "test"], @@ -15175,7 +15176,7 @@ raft_terms_of_service_helm = LightevalTaskConfig( name="raft:terms_of_service", suite=["helm", "helm_general"], - prompt_function="raft_terms_of_service", + prompt_function=prompt.raft_terms_of_service, hf_repo="ought/raft", hf_subset="terms_of_service", hf_avail_splits=["train", "test"], @@ -15200,7 +15201,7 @@ raft_tweet_eval_hate_helm = LightevalTaskConfig( name="raft:tweet_eval_hate", suite=["helm", "helm_general"], - prompt_function="raft_tweet_eval_hate", + prompt_function=prompt.raft_tweet_eval_hate, hf_repo="ought/raft", hf_subset="tweet_eval_hate", hf_avail_splits=["train", "test"], @@ -15225,7 +15226,7 @@ raft_twitter_complaints_helm = LightevalTaskConfig( name="raft:twitter_complaints", suite=["helm", "helm_general"], - prompt_function="raft_twitter_complaints", + prompt_function=prompt.raft_twitter_complaints, hf_repo="ought/raft", hf_subset="twitter_complaints", hf_avail_splits=["train", "test"], @@ -15250,7 +15251,7 @@ real_or_fake_text_bigbench = LightevalTaskConfig( name="real_or_fake_text", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="real_or_fake_text", hf_avail_splits=["default", "train", "validation"], @@ -15268,7 +15269,7 @@ real_toxicity_prompts_helm = LightevalTaskConfig( name="real_toxicity_prompts", suite=["helm"], - prompt_function="real_toxicity_prompts", + prompt_function=prompt.real_toxicity_prompts, hf_repo="allenai/real-toxicity-prompts", hf_subset="default", hf_avail_splits=["train"], @@ -15286,7 +15287,7 @@ reasoning_about_colored_objects_bigbench = LightevalTaskConfig( name="reasoning_about_colored_objects", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="reasoning_about_colored_objects", hf_avail_splits=["default", "train", "validation"], @@ -15304,7 +15305,7 @@ 
repeat_copy_logic_bigbench_lite = LightevalTaskConfig( name="repeat_copy_logic", suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function="bigbench_whitespace_after_query", + prompt_function=prompt.bigbench_whitespace_after_query, hf_repo="bigbench", hf_subset="repeat_copy_logic", hf_avail_splits=["default", "train", "validation"], @@ -15322,7 +15323,7 @@ rephrase_bigbench = LightevalTaskConfig( name="rephrase", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="rephrase", hf_avail_splits=["default", "train", "validation"], @@ -15340,7 +15341,7 @@ rhyming_bigbench = LightevalTaskConfig( name="rhyming", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="rhyming", hf_avail_splits=["default", "train", "validation"], @@ -15358,7 +15359,7 @@ riddle_sense_bigbench = LightevalTaskConfig( name="riddle_sense", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="riddle_sense", hf_avail_splits=["default", "train", "validation"], @@ -15376,7 +15377,7 @@ ruin_names_bigbench = LightevalTaskConfig( name="ruin_names", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="ruin_names", hf_avail_splits=["default", "train", "validation"], @@ -15394,7 +15395,7 @@ salient_translation_error_detection_bigbench = LightevalTaskConfig( name="salient_translation_error_detection", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="salient_translation_error_detection", hf_avail_splits=["default", "train", "validation"], @@ -15412,7 +15413,7 @@ scientific_press_release_bigbench = LightevalTaskConfig( name="scientific_press_release", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="scientific_press_release", hf_avail_splits=["default", "train", "validation"], @@ -15430,7 +15431,7 @@ sciq_lighteval = LightevalTaskConfig( name="sciq", suite=["lighteval"], - prompt_function="sciq", + prompt_function=prompt.sciq, hf_repo="sciq", hf_subset="default", hf_avail_splits=["train", "validation", "test"], @@ -15448,7 +15449,7 @@ semantic_parsing_in_context_sparc_bigbench = LightevalTaskConfig( name="semantic_parsing_in_context_sparc", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="semantic_parsing_in_context_sparc", hf_avail_splits=["default", "train", "validation"], @@ -15466,7 +15467,7 @@ semantic_parsing_spider_bigbench = LightevalTaskConfig( name="semantic_parsing_spider", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="semantic_parsing_spider", hf_avail_splits=["default", "train", "validation"], @@ -15484,7 +15485,7 @@ sentence_ambiguity_bigbench = LightevalTaskConfig( name="sentence_ambiguity", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="sentence_ambiguity", hf_avail_splits=["default", "train", "validation"], @@ -15502,7 +15503,7 @@ similarities_abstraction_bigbench = LightevalTaskConfig( name="similarities_abstraction", suite=["bigbench", "bigbench_json"], - 
prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="similarities_abstraction", hf_avail_splits=["default", "train", "validation"], @@ -15520,7 +15521,7 @@ simp_turing_concept_bigbench = LightevalTaskConfig( name="simp_turing_concept", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="simp_turing_concept", hf_avail_splits=["default", "train", "validation"], @@ -15538,7 +15539,7 @@ simple_arithmetic_json_bigbench = LightevalTaskConfig( name="simple_arithmetic_json", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="simple_arithmetic_json", hf_avail_splits=["default", "train", "validation"], @@ -15556,7 +15557,7 @@ simple_arithmetic_json_multiple_choice_bigbench = LightevalTaskConfig( name="simple_arithmetic_json_multiple_choice", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="simple_arithmetic_json_multiple_choice", hf_avail_splits=["default", "train", "validation"], @@ -15574,7 +15575,7 @@ simple_arithmetic_json_subtasks_bigbench = LightevalTaskConfig( name="simple_arithmetic_json_subtasks", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="simple_arithmetic_json_subtasks", hf_avail_splits=["default", "train", "validation"], @@ -15592,7 +15593,7 @@ simple_arithmetic_multiple_targets_json_bigbench = LightevalTaskConfig( name="simple_arithmetic_multiple_targets_json", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="simple_arithmetic_multiple_targets_json", hf_avail_splits=["default", "train", "validation"], @@ -15610,7 +15611,7 @@ simple_ethical_questions_bigbench = LightevalTaskConfig( name="simple_ethical_questions", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="simple_ethical_questions", hf_avail_splits=["default", "train", "validation"], @@ -15628,7 +15629,7 @@ simple_text_editing_bigbench = LightevalTaskConfig( name="simple_text_editing", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="simple_text_editing", hf_avail_splits=["default", "train", "validation"], @@ -15646,7 +15647,7 @@ siqa_helm = LightevalTaskConfig( name="siqa", suite=["helm", "commonsense_scenario"], - prompt_function="siqa", + prompt_function=prompt.siqa, hf_repo="social_i_qa", hf_subset="default", hf_avail_splits=["train", "validation"], @@ -15664,7 +15665,7 @@ snarks_bigbench = LightevalTaskConfig( name="snarks", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="snarks", hf_avail_splits=["default", "train", "validation"], @@ -15682,7 +15683,7 @@ social_iqa_bigbench = LightevalTaskConfig( name="social_iqa", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="social_iqa", hf_avail_splits=["default", "train", "validation"], @@ -15700,7 +15701,7 @@ social_support_bigbench = LightevalTaskConfig( name="social_support", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", 
hf_subset="social_support", hf_avail_splits=["default", "train", "validation"], @@ -15718,7 +15719,7 @@ sports_understanding_bigbench = LightevalTaskConfig( name="sports_understanding", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="sports_understanding", hf_avail_splits=["default", "train", "validation"], @@ -15736,7 +15737,7 @@ storycloze_2016_lighteval = LightevalTaskConfig( name="storycloze:2016", suite=["lighteval", "storycloze"], - prompt_function="storycloze", + prompt_function=prompt.storycloze, hf_repo="story_cloze", hf_subset="2016", hf_avail_splits=["validation"], @@ -15754,7 +15755,7 @@ storycloze_2018_lighteval = LightevalTaskConfig( name="storycloze:2018", suite=["lighteval", "storycloze"], - prompt_function="storycloze", + prompt_function=prompt.storycloze, hf_repo="story_cloze", hf_subset="2018", hf_avail_splits=["validation"], @@ -15772,7 +15773,7 @@ strange_stories_bigbench_lite = LightevalTaskConfig( name="strange_stories", suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function="bigbench_whitespace_after_query", + prompt_function=prompt.bigbench_whitespace_after_query, hf_repo="bigbench", hf_subset="strange_stories", hf_avail_splits=["default", "train", "validation"], @@ -15790,7 +15791,7 @@ strategyqa_bigbench_lite = LightevalTaskConfig( name="strategyqa", suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function="bigbench_linefeed_before_whitespace_after_query", + prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, hf_repo="bigbench", hf_subset="strategyqa", hf_avail_splits=["default", "train", "validation"], @@ -15808,7 +15809,7 @@ sufficient_information_bigbench = LightevalTaskConfig( name="sufficient_information", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="sufficient_information", hf_avail_splits=["default", "train", "validation"], @@ -15826,7 +15827,7 @@ suicide_risk_bigbench = LightevalTaskConfig( name="suicide_risk", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="suicide_risk", hf_avail_splits=["default", "train", "validation"], @@ -15844,7 +15845,7 @@ summarization_cnn_dm_helm = LightevalTaskConfig( name="summarization:cnn-dm", suite=["helm", "helm_general"], - prompt_function="cnn_dm", + prompt_function=prompt.cnn_dm, hf_repo="lighteval/summarization", hf_subset="cnn-dm", hf_avail_splits=["train", "test", "validation"], @@ -15862,7 +15863,7 @@ summarization_xsum_helm = LightevalTaskConfig( name="summarization:xsum", suite=["helm", "helm_general"], - prompt_function="xsum", + prompt_function=prompt.xsum, hf_repo="lighteval/summarization", hf_subset="xsum", hf_avail_splits=["train", "test", "validation"], @@ -15880,7 +15881,7 @@ summarization_xsum_sampled_helm = LightevalTaskConfig( name="summarization:xsum-sampled", suite=["helm"], - prompt_function="xsum", + prompt_function=prompt.xsum, hf_repo="lighteval/summarization", hf_subset="xsum-sampled", hf_avail_splits=["train", "test", "validation"], @@ -15898,7 +15899,7 @@ super_glue_boolq_lighteval = LightevalTaskConfig( name="super_glue:boolq", suite=["lighteval", "superglue"], - prompt_function="boolq_harness", + prompt_function=prompt.boolq_harness, hf_repo="super_glue", hf_subset="boolq", hf_avail_splits=["test", "train", "validation"], @@ -15916,7 +15917,7 @@ super_glue_cb_lighteval = 
LightevalTaskConfig( name="super_glue:cb", suite=["lighteval", "superglue"], - prompt_function="cb", + prompt_function=prompt.cb, hf_repo="super_glue", hf_subset="cb", hf_avail_splits=["test", "train", "validation"], @@ -15934,7 +15935,7 @@ super_glue_copa_lighteval = LightevalTaskConfig( name="super_glue:copa", suite=["lighteval", "superglue"], - prompt_function="copa", + prompt_function=prompt.copa, hf_repo="super_glue", hf_subset="copa", hf_avail_splits=["test", "train", "validation"], @@ -15952,7 +15953,7 @@ super_glue_rte_lighteval = LightevalTaskConfig( name="super_glue:rte", suite=["lighteval", "superglue"], - prompt_function="rte", + prompt_function=prompt.rte, hf_repo="super_glue", hf_subset="rte", hf_avail_splits=["test", "train", "validation"], @@ -15970,7 +15971,7 @@ super_glue_multirc_lighteval = LightevalTaskConfig( name="super_glue:multirc", suite=["lighteval", "superglue"], - prompt_function="multirc", + prompt_function=prompt.multirc, hf_repo="super_glue", hf_subset="multirc", hf_avail_splits=["train", "validation"], @@ -15988,7 +15989,7 @@ super_glue_wic_lighteval = LightevalTaskConfig( name="super_glue:wic", suite=["lighteval", "superglue"], - prompt_function="wic", + prompt_function=prompt.wic, hf_repo="super_glue", hf_subset="wic", hf_avail_splits=["test", "train", "validation"], @@ -16006,7 +16007,7 @@ super_glue_wsc_lighteval = LightevalTaskConfig( name="super_glue:wsc", suite=["lighteval", "superglue"], - prompt_function="wsc", + prompt_function=prompt.wsc, hf_repo="super_glue", hf_subset="wsc", hf_avail_splits=["test", "train", "validation"], @@ -16024,7 +16025,7 @@ swahili_english_proverbs_bigbench = LightevalTaskConfig( name="swahili_english_proverbs", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="swahili_english_proverbs", hf_avail_splits=["default", "train", "validation"], @@ -16042,7 +16043,7 @@ swag_lighteval = LightevalTaskConfig( name="swag", suite=["lighteval"], - prompt_function="swag", + prompt_function=prompt.swag, hf_repo="swag", hf_subset="regular", hf_avail_splits=["train", "validation"], @@ -16060,7 +16061,7 @@ swedish_to_german_proverbs_bigbench = LightevalTaskConfig( name="swedish_to_german_proverbs", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="swedish_to_german_proverbs", hf_avail_splits=["default", "train", "validation"], @@ -16078,7 +16079,7 @@ symbol_interpretation_bigbench_lite = LightevalTaskConfig( name="symbol_interpretation", suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function="bigbench_linefeed_before_whitespace_after_query", + prompt_function=prompt.bigbench_linefeed_before_whitespace_after_query, hf_repo="bigbench", hf_subset="symbol_interpretation", hf_avail_splits=["default", "train", "validation"], @@ -16096,7 +16097,7 @@ synthetic_reasoning_induction_helm = LightevalTaskConfig( name="synthetic_reasoning:induction", suite=["helm"], - prompt_function="synthetic_reasoning", + prompt_function=prompt.synthetic_reasoning, hf_repo="lighteval/synthetic_reasoning", hf_subset="induction", hf_avail_splits=["train", "test", "validation"], @@ -16114,7 +16115,7 @@ synthetic_reasoning_natural_easy_helm = LightevalTaskConfig( name="synthetic_reasoning:natural_easy", suite=["helm"], - prompt_function="synthetic_reasoning_natural", + prompt_function=prompt.synthetic_reasoning_natural, hf_repo="lighteval/synthetic_reasoning_natural", hf_subset="easy", 
hf_avail_splits=["train", "test", "validation"], @@ -16132,7 +16133,7 @@ synthetic_reasoning_natural_hard_helm = LightevalTaskConfig( name="synthetic_reasoning:natural_hard", suite=["helm"], - prompt_function="synthetic_reasoning_natural", + prompt_function=prompt.synthetic_reasoning_natural, hf_repo="lighteval/synthetic_reasoning_natural", hf_subset="hard", hf_avail_splits=["train", "test", "validation"], @@ -16150,7 +16151,7 @@ synthetic_reasoning_pattern_match_helm = LightevalTaskConfig( name="synthetic_reasoning:pattern_match", suite=["helm"], - prompt_function="synthetic_reasoning", + prompt_function=prompt.synthetic_reasoning, hf_repo="lighteval/synthetic_reasoning", hf_subset="pattern_match", hf_avail_splits=["train", "test", "validation"], @@ -16168,7 +16169,7 @@ synthetic_reasoning_variable_substitution_helm = LightevalTaskConfig( name="synthetic_reasoning:variable_substitution", suite=["helm"], - prompt_function="synthetic_reasoning", + prompt_function=prompt.synthetic_reasoning, hf_repo="lighteval/synthetic_reasoning", hf_subset="variable_substitution", hf_avail_splits=["train", "test", "validation"], @@ -16186,7 +16187,7 @@ tellmewhy_bigbench = LightevalTaskConfig( name="tellmewhy", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="tellmewhy", hf_avail_splits=["default", "train", "validation"], @@ -16204,7 +16205,7 @@ temporal_sequences_bigbench = LightevalTaskConfig( name="temporal_sequences", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="temporal_sequences", hf_avail_splits=["default", "train", "validation"], @@ -16222,7 +16223,7 @@ tense_bigbench = LightevalTaskConfig( name="tense", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="tense", hf_avail_splits=["default", "train", "validation"], @@ -16240,7 +16241,7 @@ the_pile_arxiv_lighteval = LightevalTaskConfig( name="the_pile:arxiv", suite=["lighteval", "pile"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile", hf_subset="pile_arxiv", hf_avail_splits=["validation", "test"], @@ -16258,7 +16259,7 @@ the_pile_arxiv_helm = LightevalTaskConfig( name="the_pile:arxiv", suite=["helm"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile_helm", hf_subset="arxiv", hf_avail_splits=["test"], @@ -16276,7 +16277,7 @@ the_pile_bibliotik_helm = LightevalTaskConfig( name="the_pile:bibliotik", suite=["helm"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile_helm", hf_subset="bibliotik", hf_avail_splits=["test"], @@ -16294,7 +16295,7 @@ the_pile_bookcorpus2_lighteval = LightevalTaskConfig( name="the_pile:bookcorpus2", suite=["lighteval", "pile"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile", hf_subset="pile_bookcorpus2", hf_avail_splits=["validation", "test"], @@ -16312,7 +16313,7 @@ the_pile_books3_lighteval = LightevalTaskConfig( name="the_pile:books3", suite=["lighteval", "pile"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile", hf_subset="pile_books3", hf_avail_splits=["validation", "test"], @@ -16330,7 +16331,7 @@ the_pile_commoncrawl_helm = LightevalTaskConfig( name="the_pile:commoncrawl", suite=["helm"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, 
hf_repo="lighteval/pile_helm", hf_subset="commoncrawl", hf_avail_splits=["test"], @@ -16348,7 +16349,7 @@ the_pile_dm_mathematics_lighteval = LightevalTaskConfig( name="the_pile:dm-mathematics", suite=["lighteval", "pile"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile", hf_subset="pile_dm-mathematics", hf_avail_splits=["validation", "test"], @@ -16366,7 +16367,7 @@ the_pile_dm_mathematics_helm = LightevalTaskConfig( name="the_pile:dm-mathematics", suite=["helm"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile_helm", hf_subset="dm-mathematics", hf_avail_splits=["test"], @@ -16384,7 +16385,7 @@ the_pile_enron_lighteval = LightevalTaskConfig( name="the_pile:enron", suite=["lighteval", "pile"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile", hf_subset="pile_enron", hf_avail_splits=["validation", "test"], @@ -16402,7 +16403,7 @@ the_pile_enron_helm = LightevalTaskConfig( name="the_pile:enron", suite=["helm"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile_helm", hf_subset="enron", hf_avail_splits=["test"], @@ -16420,7 +16421,7 @@ the_pile_europarl_lighteval = LightevalTaskConfig( name="the_pile:europarl", suite=["lighteval", "pile"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile", hf_subset="pile_europarl", hf_avail_splits=["validation", "test"], @@ -16438,7 +16439,7 @@ the_pile_europarl_helm = LightevalTaskConfig( name="the_pile:europarl", suite=["helm"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile_helm", hf_subset="europarl", hf_avail_splits=["test"], @@ -16456,7 +16457,7 @@ the_pile_freelaw_lighteval = LightevalTaskConfig( name="the_pile:freelaw", suite=["lighteval", "pile"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile", hf_subset="pile_freelaw", hf_avail_splits=["validation", "test"], @@ -16474,7 +16475,7 @@ the_pile_freelaw_helm = LightevalTaskConfig( name="the_pile:freelaw", suite=["helm"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile_helm", hf_subset="freelaw", hf_avail_splits=["test"], @@ -16492,7 +16493,7 @@ the_pile_github_lighteval = LightevalTaskConfig( name="the_pile:github", suite=["lighteval", "pile"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile", hf_subset="pile_github", hf_avail_splits=["validation", "test"], @@ -16510,7 +16511,7 @@ the_pile_github_helm = LightevalTaskConfig( name="the_pile:github", suite=["helm"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile_helm", hf_subset="github", hf_avail_splits=["test"], @@ -16528,7 +16529,7 @@ the_pile_gutenberg_lighteval = LightevalTaskConfig( name="the_pile:gutenberg", suite=["lighteval", "pile"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile", hf_subset="pile_gutenberg", hf_avail_splits=["validation", "test"], @@ -16546,7 +16547,7 @@ the_pile_gutenberg_helm = LightevalTaskConfig( name="the_pile:gutenberg", suite=["helm"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile_helm", hf_subset="gutenberg", hf_avail_splits=["test"], @@ -16564,7 +16565,7 @@ the_pile_hackernews_lighteval = LightevalTaskConfig( name="the_pile:hackernews", suite=["lighteval", "pile"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, 
hf_repo="lighteval/pile", hf_subset="pile_hackernews", hf_avail_splits=["validation", "test"], @@ -16582,7 +16583,7 @@ the_pile_hackernews_helm = LightevalTaskConfig( name="the_pile:hackernews", suite=["helm"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile_helm", hf_subset="hackernews", hf_avail_splits=["test"], @@ -16600,7 +16601,7 @@ the_pile_nih_exporter_lighteval = LightevalTaskConfig( name="the_pile:nih-exporter", suite=["lighteval", "pile"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile", hf_subset="pile_nih-exporter", hf_avail_splits=["validation", "test"], @@ -16618,7 +16619,7 @@ the_pile_nih_exporter_helm = LightevalTaskConfig( name="the_pile:nih-exporter", suite=["helm"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile_helm", hf_subset="nih-exporter", hf_avail_splits=["test"], @@ -16636,7 +16637,7 @@ the_pile_opensubtitles_lighteval = LightevalTaskConfig( name="the_pile:opensubtitles", suite=["lighteval", "pile"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile", hf_subset="pile_opensubtitles", hf_avail_splits=["validation", "test"], @@ -16654,7 +16655,7 @@ the_pile_opensubtitles_helm = LightevalTaskConfig( name="the_pile:opensubtitles", suite=["helm"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile_helm", hf_subset="opensubtitles", hf_avail_splits=["test"], @@ -16672,7 +16673,7 @@ the_pile_openwebtext2_lighteval = LightevalTaskConfig( name="the_pile:openwebtext2", suite=["lighteval", "pile"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile", hf_subset="pile_openwebtext2", hf_avail_splits=["validation", "test"], @@ -16690,7 +16691,7 @@ the_pile_openwebtext2_helm = LightevalTaskConfig( name="the_pile:openwebtext2", suite=["helm"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile_helm", hf_subset="openwebtext2", hf_avail_splits=["test"], @@ -16708,7 +16709,7 @@ the_pile_philpapers_lighteval = LightevalTaskConfig( name="the_pile:philpapers", suite=["lighteval", "pile"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile", hf_subset="pile_philpapers", hf_avail_splits=["validation", "test"], @@ -16726,7 +16727,7 @@ the_pile_pile_cc_lighteval = LightevalTaskConfig( name="the_pile:pile-cc", suite=["lighteval", "pile"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile", hf_subset="pile_pile-cc", hf_avail_splits=["validation", "test"], @@ -16744,7 +16745,7 @@ the_pile_pubmed_abstracts_lighteval = LightevalTaskConfig( name="the_pile:pubmed-abstracts", suite=["lighteval", "pile"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile", hf_subset="pile_pubmed-abstracts", hf_avail_splits=["validation", "test"], @@ -16762,7 +16763,7 @@ the_pile_pubmed_abstracts_helm = LightevalTaskConfig( name="the_pile:pubmed-abstracts", suite=["helm"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile_helm", hf_subset="pubmed-abstracts", hf_avail_splits=["test"], @@ -16780,7 +16781,7 @@ the_pile_pubmed_central_lighteval = LightevalTaskConfig( name="the_pile:pubmed-central", suite=["lighteval", "pile"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile", hf_subset="pile_pubmed-central", hf_avail_splits=["validation", "test"], @@ 
-16798,7 +16799,7 @@ the_pile_pubmed_central_helm = LightevalTaskConfig( name="the_pile:pubmed-central", suite=["helm"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile_helm", hf_subset="pubmed-central", hf_avail_splits=["test"], @@ -16816,7 +16817,7 @@ the_pile_stackexchange_lighteval = LightevalTaskConfig( name="the_pile:stackexchange", suite=["lighteval", "pile"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile", hf_subset="pile_stackexchange", hf_avail_splits=["validation", "test"], @@ -16834,7 +16835,7 @@ the_pile_stackexchange_helm = LightevalTaskConfig( name="the_pile:stackexchange", suite=["helm"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile_helm", hf_subset="stackexchange", hf_avail_splits=["test"], @@ -16852,7 +16853,7 @@ the_pile_ubuntu_irc_lighteval = LightevalTaskConfig( name="the_pile:ubuntu-irc", suite=["lighteval", "pile"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile", hf_subset="pile_ubuntu-irc", hf_avail_splits=["validation", "test"], @@ -16870,7 +16871,7 @@ the_pile_uspto_lighteval = LightevalTaskConfig( name="the_pile:uspto", suite=["lighteval", "pile"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile", hf_subset="pile_upsto", hf_avail_splits=["validation", "test"], @@ -16888,7 +16889,7 @@ the_pile_upsto_helm = LightevalTaskConfig( name="the_pile:upsto", suite=["helm"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile_helm", hf_subset="uspto", hf_avail_splits=["test"], @@ -16906,7 +16907,7 @@ the_pile_wikipedia_lighteval = LightevalTaskConfig( name="the_pile:wikipedia", suite=["lighteval", "pile"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile", hf_subset="pile_wikipedia", hf_avail_splits=["validation", "test"], @@ -16924,7 +16925,7 @@ the_pile_wikipedia_helm = LightevalTaskConfig( name="the_pile:wikipedia", suite=["helm"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile_helm", hf_subset="wikipedia", hf_avail_splits=["test"], @@ -16942,7 +16943,7 @@ the_pile_youtubesubtitles_lighteval = LightevalTaskConfig( name="the_pile:youtubesubtitles", suite=["lighteval", "pile"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile", hf_subset="pile_youtubesubtitles", hf_avail_splits=["validation", "test"], @@ -16960,7 +16961,7 @@ the_pile_youtubesubtitles_helm = LightevalTaskConfig( name="the_pile:youtubesubtitles", suite=["helm"], - prompt_function="the_pile", + prompt_function=prompt.the_pile, hf_repo="lighteval/pile_helm", hf_subset="youtubesubtitles", hf_avail_splits=["test"], @@ -16978,7 +16979,7 @@ timedial_bigbench = LightevalTaskConfig( name="timedial", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="timedial", hf_avail_splits=["default", "train", "validation"], @@ -16996,7 +16997,7 @@ toxigen_lighteval = LightevalTaskConfig( name="toxigen", suite=["lighteval"], - prompt_function="toxigen", + prompt_function=prompt.toxigen, hf_repo="skg/toxigen-data", hf_subset="annotated", hf_avail_splits=["train", "test"], @@ -17014,7 +17015,7 @@ topical_chat_bigbench = LightevalTaskConfig( name="topical_chat", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", 
hf_subset="topical_chat", hf_avail_splits=["default", "train", "validation"], @@ -17032,7 +17033,7 @@ tracking_shuffled_objects_bigbench = LightevalTaskConfig( name="tracking_shuffled_objects", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="tracking_shuffled_objects", hf_avail_splits=["default", "train", "validation"], @@ -17050,7 +17051,7 @@ triviaqa_lighteval = LightevalTaskConfig( name="triviaqa", suite=["lighteval"], - prompt_function="triviaqa", + prompt_function=prompt.triviaqa, hf_repo="trivia_qa", hf_subset="rc.nocontext", hf_avail_splits=["train", "test", "validation"], @@ -17068,7 +17069,7 @@ truthfulqa_gen_lighteval = LightevalTaskConfig( name="truthfulqa:gen", suite=["lighteval"], - prompt_function="truthful_qa_generative", + prompt_function=prompt.truthful_qa_generative, hf_repo="truthful_qa", hf_subset="generation", hf_avail_splits=["validation"], @@ -17086,7 +17087,7 @@ truthfulqa_mc_leaderboard = LightevalTaskConfig( name="truthfulqa:mc", suite=["leaderboard"], - prompt_function="truthful_qa_multiple_choice", + prompt_function=prompt.truthful_qa_multiple_choice, hf_repo="truthful_qa", hf_subset="multiple_choice", hf_avail_splits=["validation"], @@ -17104,7 +17105,7 @@ truthfulqa_helm = LightevalTaskConfig( name="truthfulqa", suite=["helm", "helm_general"], - prompt_function="truthful_qa_helm", + prompt_function=prompt.truthful_qa_helm, hf_repo="lighteval/truthfulqa_helm", hf_subset="default", hf_avail_splits=["train", "valid"], @@ -17122,7 +17123,7 @@ twitterAAE_aa_helm = LightevalTaskConfig( name="twitterAAE:aa", suite=["helm"], - prompt_function="twitter_aae", + prompt_function=prompt.twitter_aae, hf_repo="lighteval/twitterAAE", hf_subset="aa", hf_avail_splits=["test"], @@ -17140,7 +17141,7 @@ twitterAAE_white_helm = LightevalTaskConfig( name="twitterAAE:white", suite=["helm"], - prompt_function="twitter_aae", + prompt_function=prompt.twitter_aae, hf_repo="lighteval/twitterAAE", hf_subset="white", hf_avail_splits=["test"], @@ -17158,7 +17159,7 @@ understanding_fables_bigbench = LightevalTaskConfig( name="understanding_fables", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="understanding_fables", hf_avail_splits=["default", "train", "validation"], @@ -17176,7 +17177,7 @@ undo_permutation_bigbench = LightevalTaskConfig( name="undo_permutation", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="undo_permutation", hf_avail_splits=["default", "train", "validation"], @@ -17194,7 +17195,7 @@ unit_conversion_bigbench = LightevalTaskConfig( name="unit_conversion", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="unit_conversion", hf_avail_splits=["default", "train", "validation"], @@ -17212,7 +17213,7 @@ unit_interpretation_bigbench = LightevalTaskConfig( name="unit_interpretation", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="unit_interpretation", hf_avail_splits=["default", "train", "validation"], @@ -17230,7 +17231,7 @@ unnatural_in_context_learning_bigbench = LightevalTaskConfig( name="unnatural_in_context_learning", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", 
hf_subset="unnatural_in_context_learning", hf_avail_splits=["default", "train", "validation"], @@ -17248,7 +17249,7 @@ unscramble_anagrams1_lighteval = LightevalTaskConfig( name="unscramble:anagrams1", suite=["lighteval", "unscramble"], - prompt_function="unscramble", + prompt_function=prompt.unscramble, hf_repo="lighteval/GPT3_unscramble", hf_subset="default", hf_avail_splits=["mid_word_1_anagrams"], @@ -17266,7 +17267,7 @@ unscramble_anagrams2_lighteval = LightevalTaskConfig( name="unscramble:anagrams2", suite=["lighteval", "unscramble"], - prompt_function="unscramble", + prompt_function=prompt.unscramble, hf_repo="lighteval/GPT3_unscramble", hf_subset="default", hf_avail_splits=["mid_word_2_anagrams"], @@ -17284,7 +17285,7 @@ unscramble_cycle_letters_lighteval = LightevalTaskConfig( name="unscramble:cycle_letters", suite=["lighteval", "unscramble"], - prompt_function="unscramble", + prompt_function=prompt.unscramble, hf_repo="lighteval/GPT3_unscramble", hf_subset="default", hf_avail_splits=["cycle_letters_in_word"], @@ -17302,7 +17303,7 @@ unscramble_random_insertion_lighteval = LightevalTaskConfig( name="unscramble:random_insertion", suite=["lighteval", "unscramble"], - prompt_function="unscramble", + prompt_function=prompt.unscramble, hf_repo="lighteval/GPT3_unscramble", hf_subset="default", hf_avail_splits=["random_insertion_in_word"], @@ -17320,7 +17321,7 @@ unscramble_reversed_words_lighteval = LightevalTaskConfig( name="unscramble:reversed_words", suite=["lighteval", "unscramble"], - prompt_function="unscramble", + prompt_function=prompt.unscramble, hf_repo="lighteval/GPT3_unscramble", hf_subset="default", hf_avail_splits=["reversed_words"], @@ -17338,7 +17339,7 @@ vitaminc_fact_verification_bigbench_lite = LightevalTaskConfig( name="vitaminc_fact_verification", suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function="bigbench_whitespace_after_query", + prompt_function=prompt.bigbench_whitespace_after_query, hf_repo="bigbench", hf_subset="vitaminc_fact_verification", hf_avail_splits=["default", "train", "validation"], @@ -17356,7 +17357,7 @@ webqs_lighteval = LightevalTaskConfig( name="webqs", suite=["lighteval"], - prompt_function="webqs", + prompt_function=prompt.webqs, hf_repo="web_questions", hf_subset="default", hf_avail_splits=["train", "test"], @@ -17374,7 +17375,7 @@ what_is_the_tao_bigbench = LightevalTaskConfig( name="what_is_the_tao", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="what_is_the_tao", hf_avail_splits=["default", "train", "validation"], @@ -17392,7 +17393,7 @@ which_wiki_edit_bigbench = LightevalTaskConfig( name="which_wiki_edit", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="which_wiki_edit", hf_avail_splits=["default", "train", "validation"], @@ -17410,7 +17411,7 @@ wikifact_applies_to_jurisdiction_helm = LightevalTaskConfig( name="wikifact:applies_to_jurisdiction", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="applies_to_jurisdiction", hf_avail_splits=["train", "test"], @@ -17428,7 +17429,7 @@ wikifact_atomic_number_helm = LightevalTaskConfig( name="wikifact:atomic_number", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="atomic_number", hf_avail_splits=["train", "test"], @@ -17446,7 +17447,7 @@ wikifact_author_helm = 
LightevalTaskConfig( name="wikifact:author", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="author", hf_avail_splits=["train", "test"], @@ -17464,7 +17465,7 @@ wikifact_award_received_helm = LightevalTaskConfig( name="wikifact:award_received", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="award_received", hf_avail_splits=["train", "test"], @@ -17482,7 +17483,7 @@ wikifact_basic_form_of_government_helm = LightevalTaskConfig( name="wikifact:basic_form_of_government", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="basic_form_of_government", hf_avail_splits=["train", "test"], @@ -17500,7 +17501,7 @@ wikifact_capital_helm = LightevalTaskConfig( name="wikifact:capital", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="capital", hf_avail_splits=["train", "test"], @@ -17518,7 +17519,7 @@ wikifact_capital_of_helm = LightevalTaskConfig( name="wikifact:capital_of", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="capital_of", hf_avail_splits=["train", "test"], @@ -17536,7 +17537,7 @@ wikifact_central_bank_helm = LightevalTaskConfig( name="wikifact:central_bank", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="central_bank", hf_avail_splits=["train", "test"], @@ -17554,7 +17555,7 @@ wikifact_composer_helm = LightevalTaskConfig( name="wikifact:composer", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="composer", hf_avail_splits=["train", "test"], @@ -17572,7 +17573,7 @@ wikifact_continent_helm = LightevalTaskConfig( name="wikifact:continent", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="continent", hf_avail_splits=["train", "test"], @@ -17590,7 +17591,7 @@ wikifact_country_helm = LightevalTaskConfig( name="wikifact:country", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="country", hf_avail_splits=["train", "test"], @@ -17608,7 +17609,7 @@ wikifact_country_of_citizenship_helm = LightevalTaskConfig( name="wikifact:country_of_citizenship", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="country_of_citizenship", hf_avail_splits=["train", "test"], @@ -17626,7 +17627,7 @@ wikifact_country_of_origin_helm = LightevalTaskConfig( name="wikifact:country_of_origin", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="country_of_origin", hf_avail_splits=["train", "test"], @@ -17644,7 +17645,7 @@ wikifact_creator_helm = LightevalTaskConfig( name="wikifact:creator", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="creator", hf_avail_splits=["train", "test"], @@ -17662,7 +17663,7 @@ wikifact_currency_helm = LightevalTaskConfig( name="wikifact:currency", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="currency", hf_avail_splits=["train", "test"], @@ -17680,7 +17681,7 @@ 
wikifact_defendant_helm = LightevalTaskConfig( name="wikifact:defendant", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="defendant", hf_avail_splits=["train", "test"], @@ -17698,7 +17699,7 @@ wikifact_developer_helm = LightevalTaskConfig( name="wikifact:developer", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="developer", hf_avail_splits=["train", "test"], @@ -17716,7 +17717,7 @@ wikifact_diplomatic_relation_helm = LightevalTaskConfig( name="wikifact:diplomatic_relation", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="diplomatic_relation", hf_avail_splits=["train", "test"], @@ -17734,7 +17735,7 @@ wikifact_director_helm = LightevalTaskConfig( name="wikifact:director", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="director", hf_avail_splits=["train", "test"], @@ -17752,7 +17753,7 @@ wikifact_discoverer_or_inventor_helm = LightevalTaskConfig( name="wikifact:discoverer_or_inventor", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="discoverer_or_inventor", hf_avail_splits=["train", "test"], @@ -17770,7 +17771,7 @@ wikifact_drug_or_therapy_used_for_treatment_helm = LightevalTaskConfig( name="wikifact:drug_or_therapy_used_for_treatment", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="drug_or_therapy_used_for_treatment", hf_avail_splits=["train", "test"], @@ -17788,7 +17789,7 @@ wikifact_educated_at_helm = LightevalTaskConfig( name="wikifact:educated_at", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="educated_at", hf_avail_splits=["train", "test"], @@ -17806,7 +17807,7 @@ wikifact_electron_configuration_helm = LightevalTaskConfig( name="wikifact:electron_configuration", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="electron_configuration", hf_avail_splits=["train", "test"], @@ -17824,7 +17825,7 @@ wikifact_employer_helm = LightevalTaskConfig( name="wikifact:employer", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="employer", hf_avail_splits=["train", "test"], @@ -17842,7 +17843,7 @@ wikifact_field_of_work_helm = LightevalTaskConfig( name="wikifact:field_of_work", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="field_of_work", hf_avail_splits=["train", "test"], @@ -17860,7 +17861,7 @@ wikifact_file_extension_helm = LightevalTaskConfig( name="wikifact:file_extension", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="file_extension", hf_avail_splits=["train", "test"], @@ -17878,7 +17879,7 @@ wikifact_genetic_association_helm = LightevalTaskConfig( name="wikifact:genetic_association", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="genetic_association", hf_avail_splits=["train", "test"], @@ -17896,7 +17897,7 @@ wikifact_genre_helm = LightevalTaskConfig( name="wikifact:genre", suite=["helm"], - 
prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="genre", hf_avail_splits=["train", "test"], @@ -17914,7 +17915,7 @@ wikifact_has_part_helm = LightevalTaskConfig( name="wikifact:has_part", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="has_part", hf_avail_splits=["train", "test"], @@ -17932,7 +17933,7 @@ wikifact_head_of_government_helm = LightevalTaskConfig( name="wikifact:head_of_government", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="head_of_government", hf_avail_splits=["train", "test"], @@ -17950,7 +17951,7 @@ wikifact_head_of_state_helm = LightevalTaskConfig( name="wikifact:head_of_state", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="head_of_state", hf_avail_splits=["train", "test"], @@ -17968,7 +17969,7 @@ wikifact_headquarters_location_helm = LightevalTaskConfig( name="wikifact:headquarters_location", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="headquarters_location", hf_avail_splits=["train", "test"], @@ -17986,7 +17987,7 @@ wikifact_industry_helm = LightevalTaskConfig( name="wikifact:industry", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="industry", hf_avail_splits=["train", "test"], @@ -18004,7 +18005,7 @@ wikifact_influenced_by_helm = LightevalTaskConfig( name="wikifact:influenced_by", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="influenced_by", hf_avail_splits=["train", "test"], @@ -18022,7 +18023,7 @@ wikifact_instance_of_helm = LightevalTaskConfig( name="wikifact:instance_of", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="instance_of", hf_avail_splits=["train", "test"], @@ -18040,7 +18041,7 @@ wikifact_instrument_helm = LightevalTaskConfig( name="wikifact:instrument", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="instrument", hf_avail_splits=["train", "test"], @@ -18058,7 +18059,7 @@ wikifact_language_of_work_or_name_helm = LightevalTaskConfig( name="wikifact:language_of_work_or_name", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="language_of_work_or_name", hf_avail_splits=["train", "test"], @@ -18076,7 +18077,7 @@ wikifact_languages_spoken_written_or_signed_helm = LightevalTaskConfig( name="wikifact:languages_spoken_written_or_signed", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="languages_spoken_written_or_signed", hf_avail_splits=["train", "test"], @@ -18094,7 +18095,7 @@ wikifact_laws_applied_helm = LightevalTaskConfig( name="wikifact:laws_applied", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="laws_applied", hf_avail_splits=["train", "test"], @@ -18112,7 +18113,7 @@ wikifact_located_in_the_administrative_territorial_entity_helm = LightevalTaskConfig( name="wikifact:located_in_the_administrative_territorial_entity", suite=["helm"], - prompt_function="wikifact", + 
prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="located_in_the_administrative_territorial_entity", hf_avail_splits=["train", "test"], @@ -18130,7 +18131,7 @@ wikifact_location_helm = LightevalTaskConfig( name="wikifact:location", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="location", hf_avail_splits=["train", "test"], @@ -18148,7 +18149,7 @@ wikifact_location_of_discovery_helm = LightevalTaskConfig( name="wikifact:location_of_discovery", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="location_of_discovery", hf_avail_splits=["train", "test"], @@ -18166,7 +18167,7 @@ wikifact_location_of_formation_helm = LightevalTaskConfig( name="wikifact:location_of_formation", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="location_of_formation", hf_avail_splits=["train", "test"], @@ -18184,7 +18185,7 @@ wikifact_majority_opinion_by_helm = LightevalTaskConfig( name="wikifact:majority_opinion_by", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="majority_opinion_by", hf_avail_splits=["train", "test"], @@ -18202,7 +18203,7 @@ wikifact_manufacturer_helm = LightevalTaskConfig( name="wikifact:manufacturer", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="manufacturer", hf_avail_splits=["train", "test"], @@ -18220,7 +18221,7 @@ wikifact_measured_physical_quantity_helm = LightevalTaskConfig( name="wikifact:measured_physical_quantity", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="measured_physical_quantity", hf_avail_splits=["train", "test"], @@ -18238,7 +18239,7 @@ wikifact_medical_condition_treated_helm = LightevalTaskConfig( name="wikifact:medical_condition_treated", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="medical_condition_treated", hf_avail_splits=["train", "test"], @@ -18256,7 +18257,7 @@ wikifact_member_of_helm = LightevalTaskConfig( name="wikifact:member_of", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="member_of", hf_avail_splits=["train", "test"], @@ -18274,7 +18275,7 @@ wikifact_member_of_political_party_helm = LightevalTaskConfig( name="wikifact:member_of_political_party", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="member_of_political_party", hf_avail_splits=["train", "test"], @@ -18292,7 +18293,7 @@ wikifact_member_of_sports_team_helm = LightevalTaskConfig( name="wikifact:member_of_sports_team", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="member_of_sports_team", hf_avail_splits=["train", "test"], @@ -18310,7 +18311,7 @@ wikifact_movement_helm = LightevalTaskConfig( name="wikifact:movement", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="movement", hf_avail_splits=["train", "test"], @@ -18328,7 +18329,7 @@ wikifact_named_after_helm = LightevalTaskConfig( name="wikifact:named_after", suite=["helm"], - prompt_function="wikifact", + 
prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="named_after", hf_avail_splits=["train", "test"], @@ -18346,7 +18347,7 @@ wikifact_native_language_helm = LightevalTaskConfig( name="wikifact:native_language", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="native_language", hf_avail_splits=["train", "test"], @@ -18364,7 +18365,7 @@ wikifact_number_of_processor_cores_helm = LightevalTaskConfig( name="wikifact:number_of_processor_cores", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="number_of_processor_cores", hf_avail_splits=["train", "test"], @@ -18382,7 +18383,7 @@ wikifact_occupation_helm = LightevalTaskConfig( name="wikifact:occupation", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="occupation", hf_avail_splits=["train", "test"], @@ -18400,7 +18401,7 @@ wikifact_office_held_by_head_of_government_helm = LightevalTaskConfig( name="wikifact:office_held_by_head_of_government", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="office_held_by_head_of_government", hf_avail_splits=["train", "test"], @@ -18418,7 +18419,7 @@ wikifact_office_held_by_head_of_state_helm = LightevalTaskConfig( name="wikifact:office_held_by_head_of_state", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="office_held_by_head_of_state", hf_avail_splits=["train", "test"], @@ -18436,7 +18437,7 @@ wikifact_official_language_helm = LightevalTaskConfig( name="wikifact:official_language", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="official_language", hf_avail_splits=["train", "test"], @@ -18454,7 +18455,7 @@ wikifact_operating_system_helm = LightevalTaskConfig( name="wikifact:operating_system", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="operating_system", hf_avail_splits=["train", "test"], @@ -18472,7 +18473,7 @@ wikifact_original_language_of_film_or_TV_show_helm = LightevalTaskConfig( name="wikifact:original_language_of_film_or_TV_show", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="original_language_of_film_or_TV_show", hf_avail_splits=["train", "test"], @@ -18490,7 +18491,7 @@ wikifact_original_network_helm = LightevalTaskConfig( name="wikifact:original_network", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="original_network", hf_avail_splits=["train", "test"], @@ -18508,7 +18509,7 @@ wikifact_overrules_helm = LightevalTaskConfig( name="wikifact:overrules", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="overrules", hf_avail_splits=["train", "test"], @@ -18526,7 +18527,7 @@ wikifact_owned_by_helm = LightevalTaskConfig( name="wikifact:owned_by", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="owned_by", hf_avail_splits=["train", "test"], @@ -18544,7 +18545,7 @@ wikifact_part_of_helm = LightevalTaskConfig( name="wikifact:part_of", suite=["helm"], - prompt_function="wikifact", + 
prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="part_of", hf_avail_splits=["train", "test"], @@ -18562,7 +18563,7 @@ wikifact_participating_team_helm = LightevalTaskConfig( name="wikifact:participating_team", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="participating_team", hf_avail_splits=["train", "test"], @@ -18580,7 +18581,7 @@ wikifact_place_of_birth_helm = LightevalTaskConfig( name="wikifact:place_of_birth", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="place_of_birth", hf_avail_splits=["train", "test"], @@ -18598,7 +18599,7 @@ wikifact_place_of_death_helm = LightevalTaskConfig( name="wikifact:place_of_death", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="place_of_death", hf_avail_splits=["train", "test"], @@ -18616,7 +18617,7 @@ wikifact_plaintiff_helm = LightevalTaskConfig( name="wikifact:plaintiff", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="plaintiff", hf_avail_splits=["train", "test"], @@ -18634,7 +18635,7 @@ wikifact_position_held_helm = LightevalTaskConfig( name="wikifact:position_held", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="position_held", hf_avail_splits=["train", "test"], @@ -18652,7 +18653,7 @@ wikifact_position_played_on_team_helm = LightevalTaskConfig( name="wikifact:position_played_on_team", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="position_played_on_team", hf_avail_splits=["train", "test"], @@ -18670,7 +18671,7 @@ wikifact_programming_language_helm = LightevalTaskConfig( name="wikifact:programming_language", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="programming_language", hf_avail_splits=["train", "test"], @@ -18688,7 +18689,7 @@ wikifact_recommended_unit_of_measurement_helm = LightevalTaskConfig( name="wikifact:recommended_unit_of_measurement", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="recommended_unit_of_measurement", hf_avail_splits=["train", "test"], @@ -18706,7 +18707,7 @@ wikifact_record_label_helm = LightevalTaskConfig( name="wikifact:record_label", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="record_label", hf_avail_splits=["train", "test"], @@ -18724,7 +18725,7 @@ wikifact_religion_helm = LightevalTaskConfig( name="wikifact:religion", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="religion", hf_avail_splits=["train", "test"], @@ -18742,7 +18743,7 @@ wikifact_repealed_by_helm = LightevalTaskConfig( name="wikifact:repealed_by", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="repealed_by", hf_avail_splits=["train", "test"], @@ -18760,7 +18761,7 @@ wikifact_shares_border_with_helm = LightevalTaskConfig( name="wikifact:shares_border_with", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="shares_border_with", 
hf_avail_splits=["train", "test"], @@ -18778,7 +18779,7 @@ wikifact_solved_by_helm = LightevalTaskConfig( name="wikifact:solved_by", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="solved_by", hf_avail_splits=["train", "test"], @@ -18796,7 +18797,7 @@ wikifact_statement_describes_helm = LightevalTaskConfig( name="wikifact:statement_describes", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="statement_describes", hf_avail_splits=["train", "test"], @@ -18814,7 +18815,7 @@ wikifact_stock_exchange_helm = LightevalTaskConfig( name="wikifact:stock_exchange", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="stock_exchange", hf_avail_splits=["train", "test"], @@ -18832,7 +18833,7 @@ wikifact_subclass_of_helm = LightevalTaskConfig( name="wikifact:subclass_of", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="subclass_of", hf_avail_splits=["train", "test"], @@ -18850,7 +18851,7 @@ wikifact_subsidiary_helm = LightevalTaskConfig( name="wikifact:subsidiary", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="subsidiary", hf_avail_splits=["train", "test"], @@ -18868,7 +18869,7 @@ wikifact_symptoms_and_signs_helm = LightevalTaskConfig( name="wikifact:symptoms_and_signs", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="symptoms_and_signs", hf_avail_splits=["train", "test"], @@ -18886,7 +18887,7 @@ wikifact_therapeutic_area_helm = LightevalTaskConfig( name="wikifact:therapeutic_area", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="therapeutic_area", hf_avail_splits=["train", "test"], @@ -18904,7 +18905,7 @@ wikifact_time_of_discovery_or_invention_helm = LightevalTaskConfig( name="wikifact:time_of_discovery_or_invention", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="time_of_discovery_or_invention", hf_avail_splits=["train", "test"], @@ -18922,7 +18923,7 @@ wikifact_twinned_administrative_body_helm = LightevalTaskConfig( name="wikifact:twinned_administrative_body", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="twinned_administrative_body", hf_avail_splits=["train", "test"], @@ -18940,7 +18941,7 @@ wikifact_work_location_helm = LightevalTaskConfig( name="wikifact:work_location", suite=["helm"], - prompt_function="wikifact", + prompt_function=prompt.wikifact, hf_repo="lighteval/wikifact", hf_subset="work_location", hf_avail_splits=["train", "test"], @@ -18958,7 +18959,7 @@ wikitext_2_lighteval = LightevalTaskConfig( name="wikitext:2", suite=["lighteval"], - prompt_function="wikitext", + prompt_function=prompt.wikitext, hf_repo="wikitext", hf_subset="wikitext-2-raw-v1", hf_avail_splits=["train", "validation", "test"], @@ -18976,7 +18977,7 @@ wikitext_103_document_level_harness = LightevalTaskConfig( name="wikitext:103:document_level", suite=["harness"], - prompt_function="wikitext_harness", + prompt_function=prompt.wikitext_harness, hf_repo="EleutherAI/wikitext_document_level", hf_subset="wikitext-103-raw-v1", hf_avail_splits=["train", "test"], @@ -18994,7 
+18995,7 @@ wikitext_103_document_level_helm = LightevalTaskConfig( name="wikitext:103:document_level", suite=["helm"], - prompt_function="wikitext_helm", + prompt_function=prompt.wikitext_helm, hf_repo="EleutherAI/wikitext_document_level", hf_subset="wikitext-103-raw-v1", hf_avail_splits=["train", "test"], @@ -19012,7 +19013,7 @@ wino_x_german_bigbench = LightevalTaskConfig( name="wino_x_german", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="wino_x_german", hf_avail_splits=["default", "train", "validation"], @@ -19030,7 +19031,7 @@ winogrande_leaderboard = LightevalTaskConfig( name="winogrande", suite=["leaderboard"], - prompt_function="winogrande", + prompt_function=prompt.winogrande, hf_repo="winogrande", hf_subset="winogrande_xl", hf_avail_splits=["train", "test", "validation"], @@ -19048,7 +19049,7 @@ winowhy_bigbench_lite = LightevalTaskConfig( name="winowhy", suite=["bigbench_lite", "bigbench", "bigbench_json"], - prompt_function="bigbench_whitespace_after_query", + prompt_function=prompt.bigbench_whitespace_after_query, hf_repo="bigbench", hf_subset="winowhy", hf_avail_splits=["default", "train", "validation"], @@ -19066,7 +19067,7 @@ wmt08_cs_en_lighteval = LightevalTaskConfig( name="wmt08:cs-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt08_cs-en", hf_avail_splits=["test"], @@ -19084,7 +19085,7 @@ wmt08_de_en_lighteval = LightevalTaskConfig( name="wmt08:de-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt08_de-en", hf_avail_splits=["test"], @@ -19102,7 +19103,7 @@ wmt08_en_cs_lighteval = LightevalTaskConfig( name="wmt08:en-cs", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt08_en-cs", hf_avail_splits=["test"], @@ -19120,7 +19121,7 @@ wmt08_en_de_lighteval = LightevalTaskConfig( name="wmt08:en-de", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt08_en-de", hf_avail_splits=["test"], @@ -19138,7 +19139,7 @@ wmt08_en_es_lighteval = LightevalTaskConfig( name="wmt08:en-es", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt08_en-es", hf_avail_splits=["test"], @@ -19156,7 +19157,7 @@ wmt08_en_fr_lighteval = LightevalTaskConfig( name="wmt08:en-fr", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt08_en-fr", hf_avail_splits=["test"], @@ -19174,7 +19175,7 @@ wmt08_en_hu_lighteval = LightevalTaskConfig( name="wmt08:en-hu", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt08_en-hu", hf_avail_splits=["test"], @@ -19192,7 +19193,7 @@ wmt08_es_en_lighteval = LightevalTaskConfig( name="wmt08:es-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, 
hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt08_es-en", hf_avail_splits=["test"], @@ -19210,7 +19211,7 @@ wmt08_fr_en_lighteval = LightevalTaskConfig( name="wmt08:fr-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt08_fr-en", hf_avail_splits=["test"], @@ -19228,7 +19229,7 @@ wmt08_hu_en_lighteval = LightevalTaskConfig( name="wmt08:hu-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt08_hu-en", hf_avail_splits=["test"], @@ -19246,7 +19247,7 @@ wmt09_cs_en_lighteval = LightevalTaskConfig( name="wmt09:cs-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt09_cs-en", hf_avail_splits=["test"], @@ -19264,7 +19265,7 @@ wmt09_de_en_lighteval = LightevalTaskConfig( name="wmt09:de-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt09_de-en", hf_avail_splits=["test"], @@ -19282,7 +19283,7 @@ wmt09_en_cs_lighteval = LightevalTaskConfig( name="wmt09:en-cs", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt09_en-cs", hf_avail_splits=["test"], @@ -19300,7 +19301,7 @@ wmt09_en_de_lighteval = LightevalTaskConfig( name="wmt09:en-de", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt09_en-de", hf_avail_splits=["test"], @@ -19318,7 +19319,7 @@ wmt09_en_es_lighteval = LightevalTaskConfig( name="wmt09:en-es", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt09_en-es", hf_avail_splits=["test"], @@ -19336,7 +19337,7 @@ wmt09_en_fr_lighteval = LightevalTaskConfig( name="wmt09:en-fr", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt09_en-fr", hf_avail_splits=["test"], @@ -19354,7 +19355,7 @@ wmt09_en_hu_lighteval = LightevalTaskConfig( name="wmt09:en-hu", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt09_en-hu", hf_avail_splits=["test"], @@ -19372,7 +19373,7 @@ wmt09_en_it_lighteval = LightevalTaskConfig( name="wmt09:en-it", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt09_en-it", hf_avail_splits=["test"], @@ -19390,7 +19391,7 @@ wmt09_es_en_lighteval = LightevalTaskConfig( name="wmt09:es-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt09_es-en", hf_avail_splits=["test"], @@ -19408,7 +19409,7 @@ wmt09_fr_en_lighteval = LightevalTaskConfig( name="wmt09:fr-en", suite=["lighteval", "sacrebleu"], - 
prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt09_fr-en", hf_avail_splits=["test"], @@ -19426,7 +19427,7 @@ wmt09_hu_en_lighteval = LightevalTaskConfig( name="wmt09:hu-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt09_hu-en", hf_avail_splits=["test"], @@ -19444,7 +19445,7 @@ wmt09_it_en_lighteval = LightevalTaskConfig( name="wmt09:it-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt09_it-en", hf_avail_splits=["test"], @@ -19462,7 +19463,7 @@ wmt10_cs_en_lighteval = LightevalTaskConfig( name="wmt10:cs-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt10_cs-en", hf_avail_splits=["test"], @@ -19480,7 +19481,7 @@ wmt10_de_en_lighteval = LightevalTaskConfig( name="wmt10:de-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt10_de-en", hf_avail_splits=["test"], @@ -19498,7 +19499,7 @@ wmt10_en_cs_lighteval = LightevalTaskConfig( name="wmt10:en-cs", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt10_en-cs", hf_avail_splits=["test"], @@ -19516,7 +19517,7 @@ wmt10_en_de_lighteval = LightevalTaskConfig( name="wmt10:en-de", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt10_en-de", hf_avail_splits=["test"], @@ -19534,7 +19535,7 @@ wmt10_en_es_lighteval = LightevalTaskConfig( name="wmt10:en-es", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt10_en-es", hf_avail_splits=["test"], @@ -19552,7 +19553,7 @@ wmt10_en_fr_lighteval = LightevalTaskConfig( name="wmt10:en-fr", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt10_en-fr", hf_avail_splits=["test"], @@ -19570,7 +19571,7 @@ wmt10_es_en_lighteval = LightevalTaskConfig( name="wmt10:es-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt10_es-en", hf_avail_splits=["test"], @@ -19588,7 +19589,7 @@ wmt10_fr_en_lighteval = LightevalTaskConfig( name="wmt10:fr-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt10_fr-en", hf_avail_splits=["test"], @@ -19606,7 +19607,7 @@ wmt11_cs_en_lighteval = LightevalTaskConfig( name="wmt11:cs-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt11_cs-en", hf_avail_splits=["test"], @@ -19624,7 +19625,7 @@ wmt11_de_en_lighteval = 
LightevalTaskConfig( name="wmt11:de-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt11_de-en", hf_avail_splits=["test"], @@ -19642,7 +19643,7 @@ wmt11_en_cs_lighteval = LightevalTaskConfig( name="wmt11:en-cs", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt11_en-cs", hf_avail_splits=["test"], @@ -19660,7 +19661,7 @@ wmt11_en_de_lighteval = LightevalTaskConfig( name="wmt11:en-de", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt11_en-de", hf_avail_splits=["test"], @@ -19678,7 +19679,7 @@ wmt11_en_es_lighteval = LightevalTaskConfig( name="wmt11:en-es", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt11_en-es", hf_avail_splits=["test"], @@ -19696,7 +19697,7 @@ wmt11_en_fr_lighteval = LightevalTaskConfig( name="wmt11:en-fr", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt11_en-fr", hf_avail_splits=["test"], @@ -19714,7 +19715,7 @@ wmt11_es_en_lighteval = LightevalTaskConfig( name="wmt11:es-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt11_es-en", hf_avail_splits=["test"], @@ -19732,7 +19733,7 @@ wmt11_fr_en_lighteval = LightevalTaskConfig( name="wmt11:fr-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt11_fr-en", hf_avail_splits=["test"], @@ -19750,7 +19751,7 @@ wmt12_cs_en_lighteval = LightevalTaskConfig( name="wmt12:cs-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt12_cs-en", hf_avail_splits=["test"], @@ -19768,7 +19769,7 @@ wmt12_de_en_lighteval = LightevalTaskConfig( name="wmt12:de-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt12_de-en", hf_avail_splits=["test"], @@ -19786,7 +19787,7 @@ wmt12_en_cs_lighteval = LightevalTaskConfig( name="wmt12:en-cs", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt12_en-cs", hf_avail_splits=["test"], @@ -19804,7 +19805,7 @@ wmt12_en_de_lighteval = LightevalTaskConfig( name="wmt12:en-de", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt12_en-de", hf_avail_splits=["test"], @@ -19822,7 +19823,7 @@ wmt12_en_es_lighteval = LightevalTaskConfig( name="wmt12:en-es", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt12_en-es", 
hf_avail_splits=["test"], @@ -19840,7 +19841,7 @@ wmt12_en_fr_lighteval = LightevalTaskConfig( name="wmt12:en-fr", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt12_en-fr", hf_avail_splits=["test"], @@ -19858,7 +19859,7 @@ wmt12_es_en_lighteval = LightevalTaskConfig( name="wmt12:es-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt12_es-en", hf_avail_splits=["test"], @@ -19876,7 +19877,7 @@ wmt12_fr_en_lighteval = LightevalTaskConfig( name="wmt12:fr-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt12_fr-en", hf_avail_splits=["test"], @@ -19894,7 +19895,7 @@ wmt13_cs_en_lighteval = LightevalTaskConfig( name="wmt13:cs-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt13_cs-en", hf_avail_splits=["test"], @@ -19912,7 +19913,7 @@ wmt13_de_en_lighteval = LightevalTaskConfig( name="wmt13:de-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt13_de-en", hf_avail_splits=["test"], @@ -19930,7 +19931,7 @@ wmt13_en_cs_lighteval = LightevalTaskConfig( name="wmt13:en-cs", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt13_en-cs", hf_avail_splits=["test"], @@ -19948,7 +19949,7 @@ wmt13_en_de_lighteval = LightevalTaskConfig( name="wmt13:en-de", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt13_en-de", hf_avail_splits=["test"], @@ -19966,7 +19967,7 @@ wmt13_en_es_lighteval = LightevalTaskConfig( name="wmt13:en-es", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt13_en-es", hf_avail_splits=["test"], @@ -19984,7 +19985,7 @@ wmt13_en_fr_lighteval = LightevalTaskConfig( name="wmt13:en-fr", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt13_en-fr", hf_avail_splits=["test"], @@ -20002,7 +20003,7 @@ wmt13_en_ru_lighteval = LightevalTaskConfig( name="wmt13:en-ru", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt13_en-ru", hf_avail_splits=["test"], @@ -20020,7 +20021,7 @@ wmt13_es_en_lighteval = LightevalTaskConfig( name="wmt13:es-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt13_es-en", hf_avail_splits=["test"], @@ -20038,7 +20039,7 @@ wmt13_fr_en_lighteval = LightevalTaskConfig( name="wmt13:fr-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, 
hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt13_fr-en", hf_avail_splits=["test"], @@ -20056,7 +20057,7 @@ wmt13_ru_en_lighteval = LightevalTaskConfig( name="wmt13:ru-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt13_ru-en", hf_avail_splits=["test"], @@ -20074,7 +20075,7 @@ wmt14_cs_en_lighteval = LightevalTaskConfig( name="wmt14:cs-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt14_cs-en", hf_avail_splits=["test"], @@ -20092,7 +20093,7 @@ wmt14_de_en_lighteval = LightevalTaskConfig( name="wmt14:de-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt14_de-en", hf_avail_splits=["test"], @@ -20110,7 +20111,7 @@ wmt14_en_cs_lighteval = LightevalTaskConfig( name="wmt14:en-cs", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt14_en-cs", hf_avail_splits=["test"], @@ -20128,7 +20129,7 @@ wmt14_en_de_lighteval = LightevalTaskConfig( name="wmt14:en-de", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt14_en-de", hf_avail_splits=["test"], @@ -20146,7 +20147,7 @@ wmt14_en_fr_lighteval = LightevalTaskConfig( name="wmt14:en-fr", suite=["lighteval", "gpt3_benchmarks"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="wmt14", hf_subset="fr-en", hf_avail_splits=["train", "validation", "test"], @@ -20164,7 +20165,7 @@ wmt14_en_fr_lighteval = LightevalTaskConfig( name="wmt14:en-fr", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt14_en-fr", hf_avail_splits=["test"], @@ -20182,7 +20183,7 @@ wmt14_en_hi_lighteval = LightevalTaskConfig( name="wmt14:en-hi", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt14_en-hi", hf_avail_splits=["test"], @@ -20200,7 +20201,7 @@ wmt14_en_ru_lighteval = LightevalTaskConfig( name="wmt14:en-ru", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt14_en-ru", hf_avail_splits=["test"], @@ -20218,7 +20219,7 @@ wmt14_fr_en_lighteval = LightevalTaskConfig( name="wmt14:fr-en", suite=["lighteval", "gpt3_benchmarks"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="wmt14", hf_subset="fr-en", hf_avail_splits=["train", "validation", "test"], @@ -20236,7 +20237,7 @@ wmt14_fr_en_lighteval = LightevalTaskConfig( name="wmt14:fr-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt14_fr-en", hf_avail_splits=["test"], @@ -20254,7 +20255,7 @@ wmt14_hi_en_lighteval = LightevalTaskConfig( name="wmt14:hi-en", suite=["lighteval", "sacrebleu"], - 
prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt14_hi-en", hf_avail_splits=["test"], @@ -20272,7 +20273,7 @@ wmt14_ru_en_lighteval = LightevalTaskConfig( name="wmt14:ru-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt14_ru-en", hf_avail_splits=["test"], @@ -20290,7 +20291,7 @@ wmt14_cs_en_helm = LightevalTaskConfig( name="wmt14:cs-en", suite=["helm"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/wmt14", hf_subset="cs-en", hf_avail_splits=["train", "test", "validation"], @@ -20308,7 +20309,7 @@ wmt14_de_en_helm = LightevalTaskConfig( name="wmt14:de-en", suite=["helm"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/wmt14", hf_subset="de-en", hf_avail_splits=["train", "test", "validation"], @@ -20326,7 +20327,7 @@ wmt14_fr_en_helm = LightevalTaskConfig( name="wmt14:fr-en", suite=["helm"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/wmt14", hf_subset="fr-en", hf_avail_splits=["train", "test", "validation"], @@ -20344,7 +20345,7 @@ wmt14_hi_en_helm = LightevalTaskConfig( name="wmt14:hi-en", suite=["helm"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/wmt14", hf_subset="hi-en", hf_avail_splits=["train", "test", "validation"], @@ -20362,7 +20363,7 @@ wmt14_ru_en_helm = LightevalTaskConfig( name="wmt14:ru-en", suite=["helm"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/wmt14", hf_subset="ru-en", hf_avail_splits=["train", "test", "validation"], @@ -20380,7 +20381,7 @@ wmt15_cs_en_lighteval = LightevalTaskConfig( name="wmt15:cs-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt15_cs-en", hf_avail_splits=["test"], @@ -20398,7 +20399,7 @@ wmt15_de_en_lighteval = LightevalTaskConfig( name="wmt15:de-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt15_de-en", hf_avail_splits=["test"], @@ -20416,7 +20417,7 @@ wmt15_en_cs_lighteval = LightevalTaskConfig( name="wmt15:en-cs", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt15_en-cs", hf_avail_splits=["test"], @@ -20434,7 +20435,7 @@ wmt15_en_de_lighteval = LightevalTaskConfig( name="wmt15:en-de", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt15_en-de", hf_avail_splits=["test"], @@ -20452,7 +20453,7 @@ wmt15_en_fi_lighteval = LightevalTaskConfig( name="wmt15:en-fi", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt15_en-fi", hf_avail_splits=["test"], @@ -20470,7 +20471,7 @@ wmt15_en_fr_lighteval = LightevalTaskConfig( name="wmt15:en-fr", suite=["lighteval", "sacrebleu"], - 
prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt15_en-fr", hf_avail_splits=["test"], @@ -20488,7 +20489,7 @@ wmt15_en_ru_lighteval = LightevalTaskConfig( name="wmt15:en-ru", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt15_en-ru", hf_avail_splits=["test"], @@ -20506,7 +20507,7 @@ wmt15_fi_en_lighteval = LightevalTaskConfig( name="wmt15:fi-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt15_fi-en", hf_avail_splits=["test"], @@ -20524,7 +20525,7 @@ wmt15_fr_en_lighteval = LightevalTaskConfig( name="wmt15:fr-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt15_fr-en", hf_avail_splits=["test"], @@ -20542,7 +20543,7 @@ wmt15_ru_en_lighteval = LightevalTaskConfig( name="wmt15:ru-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt15_ru-en", hf_avail_splits=["test"], @@ -20560,7 +20561,7 @@ wmt16_cs_en_lighteval = LightevalTaskConfig( name="wmt16:cs-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt16_cs-en", hf_avail_splits=["test"], @@ -20578,7 +20579,7 @@ wmt16_de_en_lighteval = LightevalTaskConfig( name="wmt16:de-en", suite=["lighteval", "gpt3_benchmarks"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="wmt16", hf_subset="de-en", hf_avail_splits=["train", "validation", "test"], @@ -20596,7 +20597,7 @@ wmt16_de_en_lighteval = LightevalTaskConfig( name="wmt16:de-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt16_de-en", hf_avail_splits=["test"], @@ -20614,7 +20615,7 @@ wmt16_en_cs_lighteval = LightevalTaskConfig( name="wmt16:en-cs", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt16_en-cs", hf_avail_splits=["test"], @@ -20632,7 +20633,7 @@ wmt16_en_de_lighteval = LightevalTaskConfig( name="wmt16:en-de", suite=["lighteval", "gpt3_benchmarks"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="wmt16", hf_subset="de-en", hf_avail_splits=["train", "validation", "test"], @@ -20650,7 +20651,7 @@ wmt16_en_de_lighteval = LightevalTaskConfig( name="wmt16:en-de", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt16_en-de", hf_avail_splits=["test"], @@ -20668,7 +20669,7 @@ wmt16_en_fi_lighteval = LightevalTaskConfig( name="wmt16:en-fi", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt16_en-fi", hf_avail_splits=["test"], @@ -20686,7 +20687,7 @@ wmt16_en_ro_lighteval = 
LightevalTaskConfig( name="wmt16:en-ro", suite=["lighteval", "gpt3_benchmarks"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="wmt16", hf_subset="ro-en", hf_avail_splits=["train", "validation", "test"], @@ -20704,7 +20705,7 @@ wmt16_en_ro_lighteval = LightevalTaskConfig( name="wmt16:en-ro", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt16_en-ro", hf_avail_splits=["test"], @@ -20722,7 +20723,7 @@ wmt16_en_ru_lighteval = LightevalTaskConfig( name="wmt16:en-ru", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt16_en-ru", hf_avail_splits=["test"], @@ -20740,7 +20741,7 @@ wmt16_en_tr_lighteval = LightevalTaskConfig( name="wmt16:en-tr", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt16_en-tr", hf_avail_splits=["test"], @@ -20758,7 +20759,7 @@ wmt16_fi_en_lighteval = LightevalTaskConfig( name="wmt16:fi-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt16_fi-en", hf_avail_splits=["test"], @@ -20776,7 +20777,7 @@ wmt16_ro_en_lighteval = LightevalTaskConfig( name="wmt16:ro-en", suite=["lighteval", "gpt3_benchmarks"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="wmt16", hf_subset="ro-en", hf_avail_splits=["train", "validation", "test"], @@ -20794,7 +20795,7 @@ wmt16_ro_en_lighteval = LightevalTaskConfig( name="wmt16:ro-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt16_ro-en", hf_avail_splits=["test"], @@ -20812,7 +20813,7 @@ wmt16_ru_en_lighteval = LightevalTaskConfig( name="wmt16:ru-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt16_ru-en", hf_avail_splits=["test"], @@ -20830,7 +20831,7 @@ wmt16_tr_en_lighteval = LightevalTaskConfig( name="wmt16:tr-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt16_tr-en", hf_avail_splits=["test"], @@ -20848,7 +20849,7 @@ wmt17_cs_en_lighteval = LightevalTaskConfig( name="wmt17:cs-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt17_cs-en", hf_avail_splits=["test"], @@ -20866,7 +20867,7 @@ wmt17_de_en_lighteval = LightevalTaskConfig( name="wmt17:de-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt17_de-en", hf_avail_splits=["test"], @@ -20884,7 +20885,7 @@ wmt17_en_cs_lighteval = LightevalTaskConfig( name="wmt17:en-cs", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt17_en-cs", 
hf_avail_splits=["test"], @@ -20902,7 +20903,7 @@ wmt17_en_de_lighteval = LightevalTaskConfig( name="wmt17:en-de", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt17_en-de", hf_avail_splits=["test"], @@ -20920,7 +20921,7 @@ wmt17_en_fi_lighteval = LightevalTaskConfig( name="wmt17:en-fi", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt17_en-fi", hf_avail_splits=["test"], @@ -20938,7 +20939,7 @@ wmt17_en_lv_lighteval = LightevalTaskConfig( name="wmt17:en-lv", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt17_en-lv", hf_avail_splits=["test"], @@ -20956,7 +20957,7 @@ wmt17_en_ru_lighteval = LightevalTaskConfig( name="wmt17:en-ru", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt17_en-ru", hf_avail_splits=["test"], @@ -20974,7 +20975,7 @@ wmt17_en_tr_lighteval = LightevalTaskConfig( name="wmt17:en-tr", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt17_en-tr", hf_avail_splits=["test"], @@ -20992,7 +20993,7 @@ wmt17_en_zh_lighteval = LightevalTaskConfig( name="wmt17:en-zh", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt17_en-zh", hf_avail_splits=["test"], @@ -21010,7 +21011,7 @@ wmt17_fi_en_lighteval = LightevalTaskConfig( name="wmt17:fi-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt17_fi-en", hf_avail_splits=["test"], @@ -21028,7 +21029,7 @@ wmt17_lv_en_lighteval = LightevalTaskConfig( name="wmt17:lv-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt17_lv-en", hf_avail_splits=["test"], @@ -21046,7 +21047,7 @@ wmt17_ru_en_lighteval = LightevalTaskConfig( name="wmt17:ru-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt17_ru-en", hf_avail_splits=["test"], @@ -21064,7 +21065,7 @@ wmt17_tr_en_lighteval = LightevalTaskConfig( name="wmt17:tr-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt17_tr-en", hf_avail_splits=["test"], @@ -21082,7 +21083,7 @@ wmt17_zh_en_lighteval = LightevalTaskConfig( name="wmt17:zh-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt17_zh-en", hf_avail_splits=["test"], @@ -21100,7 +21101,7 @@ wmt18_cs_en_lighteval = LightevalTaskConfig( name="wmt18:cs-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, 
hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt18_cs-en", hf_avail_splits=["test"], @@ -21118,7 +21119,7 @@ wmt18_de_en_lighteval = LightevalTaskConfig( name="wmt18:de-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt18_de-en", hf_avail_splits=["test"], @@ -21136,7 +21137,7 @@ wmt18_en_cs_lighteval = LightevalTaskConfig( name="wmt18:en-cs", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt18_en-cs", hf_avail_splits=["test"], @@ -21154,7 +21155,7 @@ wmt18_en_de_lighteval = LightevalTaskConfig( name="wmt18:en-de", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt18_en-de", hf_avail_splits=["test"], @@ -21172,7 +21173,7 @@ wmt18_en_et_lighteval = LightevalTaskConfig( name="wmt18:en-et", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt18_en-et", hf_avail_splits=["test"], @@ -21190,7 +21191,7 @@ wmt18_en_fi_lighteval = LightevalTaskConfig( name="wmt18:en-fi", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt18_en-fi", hf_avail_splits=["test"], @@ -21208,7 +21209,7 @@ wmt18_en_ru_lighteval = LightevalTaskConfig( name="wmt18:en-ru", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt18_en-ru", hf_avail_splits=["test"], @@ -21226,7 +21227,7 @@ wmt18_en_tr_lighteval = LightevalTaskConfig( name="wmt18:en-tr", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt18_en-tr", hf_avail_splits=["test"], @@ -21244,7 +21245,7 @@ wmt18_en_zh_lighteval = LightevalTaskConfig( name="wmt18:en-zh", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt18_en-zh", hf_avail_splits=["test"], @@ -21262,7 +21263,7 @@ wmt18_et_en_lighteval = LightevalTaskConfig( name="wmt18:et-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt18_et-en", hf_avail_splits=["test"], @@ -21280,7 +21281,7 @@ wmt18_fi_en_lighteval = LightevalTaskConfig( name="wmt18:fi-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt18_fi-en", hf_avail_splits=["test"], @@ -21298,7 +21299,7 @@ wmt18_ru_en_lighteval = LightevalTaskConfig( name="wmt18:ru-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt18_ru-en", hf_avail_splits=["test"], @@ -21316,7 +21317,7 @@ wmt18_tr_en_lighteval = LightevalTaskConfig( name="wmt18:tr-en", suite=["lighteval", "sacrebleu"], - 
prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt18_tr-en", hf_avail_splits=["test"], @@ -21334,7 +21335,7 @@ wmt18_zh_en_lighteval = LightevalTaskConfig( name="wmt18:zh-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt18_zh-en", hf_avail_splits=["test"], @@ -21352,7 +21353,7 @@ wmt19_cs_de_lighteval = LightevalTaskConfig( name="wmt19:cs-de", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt19_cs-de", hf_avail_splits=["test"], @@ -21370,7 +21371,7 @@ wmt19_de_cs_lighteval = LightevalTaskConfig( name="wmt19:de-cs", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt19_de-cs", hf_avail_splits=["test"], @@ -21388,7 +21389,7 @@ wmt19_de_en_lighteval = LightevalTaskConfig( name="wmt19:de-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt19_de-en", hf_avail_splits=["test"], @@ -21406,7 +21407,7 @@ wmt19_de_fr_lighteval = LightevalTaskConfig( name="wmt19:de-fr", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt19_de-fr", hf_avail_splits=["test"], @@ -21424,7 +21425,7 @@ wmt19_en_cs_lighteval = LightevalTaskConfig( name="wmt19:en-cs", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt19_en-cs", hf_avail_splits=["test"], @@ -21442,7 +21443,7 @@ wmt19_en_de_lighteval = LightevalTaskConfig( name="wmt19:en-de", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt19_en-de", hf_avail_splits=["test"], @@ -21460,7 +21461,7 @@ wmt19_en_fi_lighteval = LightevalTaskConfig( name="wmt19:en-fi", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt19_en-fi", hf_avail_splits=["test"], @@ -21478,7 +21479,7 @@ wmt19_en_gu_lighteval = LightevalTaskConfig( name="wmt19:en-gu", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt19_en-gu", hf_avail_splits=["test"], @@ -21496,7 +21497,7 @@ wmt19_en_kk_lighteval = LightevalTaskConfig( name="wmt19:en-kk", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt19_en-kk", hf_avail_splits=["test"], @@ -21514,7 +21515,7 @@ wmt19_en_lt_lighteval = LightevalTaskConfig( name="wmt19:en-lt", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt19_en-lt", hf_avail_splits=["test"], @@ -21532,7 +21533,7 @@ wmt19_en_ru_lighteval = LightevalTaskConfig( 
name="wmt19:en-ru", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt19_en-ru", hf_avail_splits=["test"], @@ -21550,7 +21551,7 @@ wmt19_en_zh_lighteval = LightevalTaskConfig( name="wmt19:en-zh", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt19_en-zh", hf_avail_splits=["test"], @@ -21568,7 +21569,7 @@ wmt19_fi_en_lighteval = LightevalTaskConfig( name="wmt19:fi-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt19_fi-en", hf_avail_splits=["test"], @@ -21586,7 +21587,7 @@ wmt19_fr_de_lighteval = LightevalTaskConfig( name="wmt19:fr-de", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt19_fr-de", hf_avail_splits=["test"], @@ -21604,7 +21605,7 @@ wmt19_gu_en_lighteval = LightevalTaskConfig( name="wmt19:gu-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt19_gu-en", hf_avail_splits=["test"], @@ -21622,7 +21623,7 @@ wmt19_kk_en_lighteval = LightevalTaskConfig( name="wmt19:kk-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt19_kk-en", hf_avail_splits=["test"], @@ -21640,7 +21641,7 @@ wmt19_lt_en_lighteval = LightevalTaskConfig( name="wmt19:lt-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt19_lt-en", hf_avail_splits=["test"], @@ -21658,7 +21659,7 @@ wmt19_ru_en_lighteval = LightevalTaskConfig( name="wmt19:ru-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt19_ru-en", hf_avail_splits=["test"], @@ -21676,7 +21677,7 @@ wmt19_zh_en_lighteval = LightevalTaskConfig( name="wmt19:zh-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt19_zh-en", hf_avail_splits=["test"], @@ -21694,7 +21695,7 @@ wmt20_cs_en_lighteval = LightevalTaskConfig( name="wmt20:cs-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt20_cs-en", hf_avail_splits=["test"], @@ -21712,7 +21713,7 @@ wmt20_de_en_lighteval = LightevalTaskConfig( name="wmt20:de-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt20_de-en", hf_avail_splits=["test"], @@ -21730,7 +21731,7 @@ wmt20_de_fr_lighteval = LightevalTaskConfig( name="wmt20:de-fr", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt20_de-fr", 
hf_avail_splits=["test"], @@ -21748,7 +21749,7 @@ wmt20_en_cs_lighteval = LightevalTaskConfig( name="wmt20:en-cs", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt20_en-cs", hf_avail_splits=["test"], @@ -21766,7 +21767,7 @@ wmt20_en_de_lighteval = LightevalTaskConfig( name="wmt20:en-de", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt20_en-de", hf_avail_splits=["test"], @@ -21784,7 +21785,7 @@ wmt20_en_iu_lighteval = LightevalTaskConfig( name="wmt20:en-iu", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt20_en-iu", hf_avail_splits=["test"], @@ -21802,7 +21803,7 @@ wmt20_en_ja_lighteval = LightevalTaskConfig( name="wmt20:en-ja", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt20_en-ja", hf_avail_splits=["test"], @@ -21820,7 +21821,7 @@ wmt20_en_km_lighteval = LightevalTaskConfig( name="wmt20:en-km", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt20_en-km", hf_avail_splits=["test"], @@ -21838,7 +21839,7 @@ wmt20_en_pl_lighteval = LightevalTaskConfig( name="wmt20:en-pl", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt20_en-pl", hf_avail_splits=["test"], @@ -21856,7 +21857,7 @@ wmt20_en_ps_lighteval = LightevalTaskConfig( name="wmt20:en-ps", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt20_en-ps", hf_avail_splits=["test"], @@ -21874,7 +21875,7 @@ wmt20_en_ru_lighteval = LightevalTaskConfig( name="wmt20:en-ru", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt20_en-ru", hf_avail_splits=["test"], @@ -21892,7 +21893,7 @@ wmt20_en_ta_lighteval = LightevalTaskConfig( name="wmt20:en-ta", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt20_en-ta", hf_avail_splits=["test"], @@ -21910,7 +21911,7 @@ wmt20_en_zh_lighteval = LightevalTaskConfig( name="wmt20:en-zh", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_alphabetical", + prompt_function=prompt.wmt_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt20_en-zh", hf_avail_splits=["test"], @@ -21928,7 +21929,7 @@ wmt20_fr_de_lighteval = LightevalTaskConfig( name="wmt20:fr-de", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt20_fr-de", hf_avail_splits=["test"], @@ -21946,7 +21947,7 @@ wmt20_iu_en_lighteval = LightevalTaskConfig( name="wmt20:iu-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", 
hf_subset="wmt20_iu-en", hf_avail_splits=["test"], @@ -21964,7 +21965,7 @@ wmt20_ja_en_lighteval = LightevalTaskConfig( name="wmt20:ja-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt20_ja-en", hf_avail_splits=["test"], @@ -21982,7 +21983,7 @@ wmt20_km_en_lighteval = LightevalTaskConfig( name="wmt20:km-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt20_km-en", hf_avail_splits=["test"], @@ -22000,7 +22001,7 @@ wmt20_pl_en_lighteval = LightevalTaskConfig( name="wmt20:pl-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt20_pl-en", hf_avail_splits=["test"], @@ -22018,7 +22019,7 @@ wmt20_ps_en_lighteval = LightevalTaskConfig( name="wmt20:ps-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt20_ps-en", hf_avail_splits=["test"], @@ -22036,7 +22037,7 @@ wmt20_ru_en_lighteval = LightevalTaskConfig( name="wmt20:ru-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt20_ru-en", hf_avail_splits=["test"], @@ -22054,7 +22055,7 @@ wmt20_ta_en_lighteval = LightevalTaskConfig( name="wmt20:ta-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt20_ta-en", hf_avail_splits=["test"], @@ -22072,7 +22073,7 @@ wmt20_zh_en_lighteval = LightevalTaskConfig( name="wmt20:zh-en", suite=["lighteval", "sacrebleu"], - prompt_function="wmt_reverse_alphabetical", + prompt_function=prompt.wmt_reverse_alphabetical, hf_repo="lighteval/sacrebleu_manual", hf_subset="wmt20_zh-en", hf_avail_splits=["test"], @@ -22090,7 +22091,7 @@ word_sorting_bigbench = LightevalTaskConfig( name="word_sorting", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="word_sorting", hf_avail_splits=["default", "train", "validation"], @@ -22108,7 +22109,7 @@ word_unscrambling_bigbench = LightevalTaskConfig( name="word_unscrambling", suite=["bigbench", "bigbench_json"], - prompt_function="bigbench", + prompt_function=prompt.bigbench, hf_repo="bigbench", hf_subset="word_unscrambling", hf_avail_splits=["default", "train", "validation"], @@ -22126,7 +22127,7 @@ wsc273_lighteval = LightevalTaskConfig( name="wsc273", suite=["lighteval"], - prompt_function="wsc273", + prompt_function=prompt.wsc273, hf_repo="winograd_wsc", hf_subset="wsc273", hf_avail_splits=["test"], @@ -22144,7 +22145,7 @@ xcopa_en_lighteval = LightevalTaskConfig( name="xcopa:en", suite=["lighteval"], - prompt_function="xcopa_en", + prompt_function=prompt.xcopa_en, hf_repo="xcopa", hf_subset="default", hf_avail_splits=["test", "train", "validation"], @@ -22162,7 +22163,7 @@ xcopa_et_lighteval = LightevalTaskConfig( name="xcopa:et", suite=["lighteval"], - prompt_function="xcopa_et", + prompt_function=prompt.xcopa_et, hf_repo="xcopa", hf_subset="et", hf_avail_splits=["test", "train", "validation"], @@ 
-22180,7 +22181,7 @@ xcopa_ht_lighteval = LightevalTaskConfig( name="xcopa:ht", suite=["lighteval"], - prompt_function="xcopa_ht", + prompt_function=prompt.xcopa_ht, hf_repo="xcopa", hf_subset="ht", hf_avail_splits=["test", "train", "validation"], @@ -22198,7 +22199,7 @@ xcopa_it_lighteval = LightevalTaskConfig( name="xcopa:it", suite=["lighteval"], - prompt_function="xcopa_it", + prompt_function=prompt.xcopa_it, hf_repo="xcopa", hf_subset="it", hf_avail_splits=["test", "train", "validation"], @@ -22216,7 +22217,7 @@ xcopa_id_lighteval = LightevalTaskConfig( name="xcopa:id", suite=["lighteval"], - prompt_function="xcopa_id", + prompt_function=prompt.xcopa_id, hf_repo="xcopa", hf_subset="id", hf_avail_splits=["test", "train", "validation"], @@ -22234,7 +22235,7 @@ xcopa_qu_lighteval = LightevalTaskConfig( name="xcopa:qu", suite=["lighteval"], - prompt_function="xcopa_qu", + prompt_function=prompt.xcopa_qu, hf_repo="xcopa", hf_subset="qu", hf_avail_splits=["test", "train", "validation"], @@ -22252,7 +22253,7 @@ xcopa_sw_lighteval = LightevalTaskConfig( name="xcopa:sw", suite=["lighteval"], - prompt_function="xcopa_sw", + prompt_function=prompt.xcopa_sw, hf_repo="xcopa", hf_subset="sw", hf_avail_splits=["test", "train", "validation"], @@ -22270,7 +22271,7 @@ xcopa_zh_lighteval = LightevalTaskConfig( name="xcopa:zh", suite=["lighteval"], - prompt_function="xcopa_zh", + prompt_function=prompt.xcopa_zh, hf_repo="xcopa", hf_subset="zh", hf_avail_splits=["test", "train", "validation"], @@ -22288,7 +22289,7 @@ xcopa_ta_lighteval = LightevalTaskConfig( name="xcopa:ta", suite=["lighteval"], - prompt_function="xcopa_ta", + prompt_function=prompt.xcopa_ta, hf_repo="xcopa", hf_subset="ta", hf_avail_splits=["test", "train", "validation"], @@ -22306,7 +22307,7 @@ xcopa_th_lighteval = LightevalTaskConfig( name="xcopa:th", suite=["lighteval"], - prompt_function="xcopa_th", + prompt_function=prompt.xcopa_th, hf_repo="xcopa", hf_subset="th", hf_avail_splits=["test", "train", "validation"], @@ -22324,7 +22325,7 @@ xcopa_tr_lighteval = LightevalTaskConfig( name="xcopa:tr", suite=["lighteval"], - prompt_function="xcopa_tr", + prompt_function=prompt.xcopa_tr, hf_repo="xcopa", hf_subset="tr", hf_avail_splits=["test", "train", "validation"], @@ -22342,7 +22343,7 @@ xcopa_vi_lighteval = LightevalTaskConfig( name="xcopa:vi", suite=["lighteval"], - prompt_function="xcopa_vi", + prompt_function=prompt.xcopa_vi, hf_repo="xcopa", hf_subset="vi", hf_avail_splits=["test", "train", "validation"], @@ -22360,7 +22361,7 @@ xstory_cloze_en_lighteval = LightevalTaskConfig( name="xstory_cloze:en", suite=["lighteval"], - prompt_function="storycloze", + prompt_function=prompt.storycloze, hf_repo="juletxara/xstory_cloze", hf_subset="en", hf_avail_splits=["training", "eval"], @@ -22378,7 +22379,7 @@ xstory_cloze_ru_lighteval = LightevalTaskConfig( name="xstory_cloze:ru", suite=["lighteval"], - prompt_function="storycloze", + prompt_function=prompt.storycloze, hf_repo="juletxara/xstory_cloze", hf_subset="ru", hf_avail_splits=["training", "eval"], @@ -22396,7 +22397,7 @@ xstory_cloze_zh_lighteval = LightevalTaskConfig( name="xstory_cloze:zh", suite=["lighteval"], - prompt_function="storycloze", + prompt_function=prompt.storycloze, hf_repo="juletxara/xstory_cloze", hf_subset="zh", hf_avail_splits=["training", "eval"], @@ -22414,7 +22415,7 @@ xstory_cloze_es_lighteval = LightevalTaskConfig( name="xstory_cloze:es", suite=["lighteval"], - prompt_function="storycloze", + prompt_function=prompt.storycloze, hf_repo="juletxara/xstory_cloze", 
hf_subset="es", hf_avail_splits=["training", "eval"], @@ -22432,7 +22433,7 @@ xstory_cloze_ar_lighteval = LightevalTaskConfig( name="xstory_cloze:ar", suite=["lighteval"], - prompt_function="storycloze", + prompt_function=prompt.storycloze, hf_repo="juletxara/xstory_cloze", hf_subset="ar", hf_avail_splits=["training", "eval"], @@ -22450,7 +22451,7 @@ xstory_cloze_hi_lighteval = LightevalTaskConfig( name="xstory_cloze:hi", suite=["lighteval"], - prompt_function="storycloze", + prompt_function=prompt.storycloze, hf_repo="juletxara/xstory_cloze", hf_subset="hi", hf_avail_splits=["training", "eval"], @@ -22468,7 +22469,7 @@ xstory_cloze_id_lighteval = LightevalTaskConfig( name="xstory_cloze:id", suite=["lighteval"], - prompt_function="storycloze", + prompt_function=prompt.storycloze, hf_repo="juletxara/xstory_cloze", hf_subset="id", hf_avail_splits=["training", "eval"], @@ -22486,7 +22487,7 @@ xstory_cloze_te_lighteval = LightevalTaskConfig( name="xstory_cloze:te", suite=["lighteval"], - prompt_function="storycloze", + prompt_function=prompt.storycloze, hf_repo="juletxara/xstory_cloze", hf_subset="te", hf_avail_splits=["training", "eval"], @@ -22504,7 +22505,7 @@ xstory_cloze_sw_lighteval = LightevalTaskConfig( name="xstory_cloze:sw", suite=["lighteval"], - prompt_function="storycloze", + prompt_function=prompt.storycloze, hf_repo="juletxara/xstory_cloze", hf_subset="sw", hf_avail_splits=["training", "eval"], @@ -22522,7 +22523,7 @@ xstory_cloze_eu_lighteval = LightevalTaskConfig( name="xstory_cloze:eu", suite=["lighteval"], - prompt_function="storycloze", + prompt_function=prompt.storycloze, hf_repo="juletxara/xstory_cloze", hf_subset="eu", hf_avail_splits=["training", "eval"], @@ -22540,7 +22541,7 @@ xstory_cloze_my_lighteval = LightevalTaskConfig( name="xstory_cloze:my", suite=["lighteval"], - prompt_function="storycloze", + prompt_function=prompt.storycloze, hf_repo="juletxara/xstory_cloze", hf_subset="my", hf_avail_splits=["training", "eval"], @@ -22558,7 +22559,7 @@ xwinograd_en_lighteval = LightevalTaskConfig( name="xwinograd:en", suite=["lighteval"], - prompt_function="winogrande", + prompt_function=prompt.winogrande, hf_repo="Muennighoff/xwinograd", hf_subset="en", hf_avail_splits=["test"], @@ -22576,7 +22577,7 @@ xwinograd_fr_lighteval = LightevalTaskConfig( name="xwinograd:fr", suite=["lighteval"], - prompt_function="winogrande", + prompt_function=prompt.winogrande, hf_repo="Muennighoff/xwinograd", hf_subset="fr", hf_avail_splits=["test"], @@ -22594,7 +22595,7 @@ xwinograd_jp_lighteval = LightevalTaskConfig( name="xwinograd:jp", suite=["lighteval"], - prompt_function="winogrande", + prompt_function=prompt.winogrande, hf_repo="Muennighoff/xwinograd", hf_subset="jp", hf_avail_splits=["test"], @@ -22612,7 +22613,7 @@ xwinograd_pt_lighteval = LightevalTaskConfig( name="xwinograd:pt", suite=["lighteval"], - prompt_function="winogrande", + prompt_function=prompt.winogrande, hf_repo="Muennighoff/xwinograd", hf_subset="pt", hf_avail_splits=["test"], @@ -22630,7 +22631,7 @@ xwinograd_ru_lighteval = LightevalTaskConfig( name="xwinograd:ru", suite=["lighteval"], - prompt_function="winogrande", + prompt_function=prompt.winogrande, hf_repo="Muennighoff/xwinograd", hf_subset="ru", hf_avail_splits=["test"], @@ -22648,7 +22649,7 @@ xwinograd_zh_lighteval = LightevalTaskConfig( name="xwinograd:zh", suite=["lighteval"], - prompt_function="winogrande", + prompt_function=prompt.winogrande, hf_repo="Muennighoff/xwinograd", hf_subset="zh", hf_avail_splits=["test"], diff --git 
a/src/lighteval/tasks/extended/ifeval/main.py b/src/lighteval/tasks/extended/ifeval/main.py
index c7290c3f..b3274d50 100644
--- a/src/lighteval/tasks/extended/ifeval/main.py
+++ b/src/lighteval/tasks/extended/ifeval/main.py
@@ -34,10 +34,22 @@ from lighteval.tasks.requests import Doc
 
 
+# Very specific task where there are no precise outputs but instead we test if the format obeys rules
+def ifeval_prompt(line, task_name: str = None):
+    return Doc(
+        task_name=task_name,
+        query=line["prompt"],
+        choices=[""],
+        gold_index=0,
+        instruction="",
+        specific={"instructions_id_list": line["instruction_id_list"], "kwargs": line["kwargs"]},
+    )
+
+
 # We create the task config
 ifeval = LightevalTaskConfig(
     name="ifeval",
-    prompt_function="ifeval_prompt",
+    prompt_function=ifeval_prompt,
     suite=["extended"],
     hf_repo="wis-k/instruction-following-eval",
     hf_subset="default",
@@ -51,18 +63,6 @@
 )
 
 
-# very specific task where there are no precise outputs but instead we test if the format obeys rules
-def ifeval_prompt(line, task_name: str = None):
-    return Doc(
-        task_name=task_name,
-        query=line["prompt"],
-        choices=[""],
-        gold_index=0,
-        instruction="",
-        specific={"instructions_id_list": line["instruction_id_list"], "kwargs": line["kwargs"]},
-    )
-
-
 submetric_names = [
     "prompt_level_strict_acc",
     "inst_level_strict_acc",
diff --git a/src/lighteval/tasks/extended/mt_bench/main.py b/src/lighteval/tasks/extended/mt_bench/main.py
index 4dfdeb41..77b8f3ee 100644
--- a/src/lighteval/tasks/extended/mt_bench/main.py
+++ b/src/lighteval/tasks/extended/mt_bench/main.py
@@ -21,34 +21,8 @@
 # SOFTWARE.
 
 # ruff: noqa: F405, F403, F401, I001
-
-import numpy as np
-from aenum import extend_enum
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-from lighteval.metrics import Metrics
-from lighteval.metrics.utils import MetricCategory, MetricUseCase, SampleLevelMetric, SampleLevelMetricGrouping
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.requests import Doc
-from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
-from colorama import Fore, Style
-import os
-
-
-task = LightevalTaskConfig(
-    name="mt_bench",
-    prompt_function="mt_bench_prompt",  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
-    suite=["extended"],
-    hf_repo="lighteval/mt-bench",
-    hf_subset="default",
-    hf_avail_splits=["train"],
-    evaluation_splits=["train"],
-    few_shots_split="",
-    few_shots_select="random",
-    metric=["llm_judge_multi_turn_openai"],
-    generation_size=1024,
-    stop_sequence=[],
-)
 
 
 def mt_bench_prompt(line, task_name: str = None):
@@ -71,6 +45,22 @@ def mt_bench_prompt(line, task_name: str = None):
     )
 
 
+task = LightevalTaskConfig(
+    name="mt_bench",
+    prompt_function=mt_bench_prompt,  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
+    suite=["extended"],
+    hf_repo="lighteval/mt-bench",
+    hf_subset="default",
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split="",
+    few_shots_select="random",
+    metric=["llm_judge_multi_turn_openai"],
+    generation_size=1024,
+    stop_sequence=[],
+)
+
+
 TASKS_TABLE = [task]
 
 if __name__ == "__main__":
diff --git a/src/lighteval/tasks/extended/tiny_benchmarks/main.py b/src/lighteval/tasks/extended/tiny_benchmarks/main.py
index a8ce41a3..866b5924 100644
--- a/src/lighteval/tasks/extended/tiny_benchmarks/main.py
+++ b/src/lighteval/tasks/extended/tiny_benchmarks/main.py
@@ -35,6 +35,7 @@
 from aenum import extend_enum
 from scipy.optimize import minimize
 
+import lighteval.tasks.tasks_prompt_formatting as prompt
 from lighteval.metrics import Metrics
 from lighteval.metrics.metrics import CorpusLevelMetricGrouping
 from lighteval.metrics.metrics_sample import ExactMatches, LoglikelihoodAcc
@@ -186,7 +187,7 @@ def aggregate(self, y_input):
         "name": "winogrande",
         "dataset": "tinyBenchmarks/tinyWinogrande",
         "subset": "winogrande_xl",
-        "prompt": "winogrande",
+        "prompt": prompt.winogrande,
         "splits": ["train", "validation", "test"],
         "evaluation_split": ["validation"],
     },
@@ -194,7 +195,7 @@ def aggregate(self, y_input):
         "name": "arc",
         "dataset": "tinyBenchmarks/tinyAI2_arc",
         "subset": "ARC-Challenge",
-        "prompt": "arc",
+        "prompt": prompt.arc,
         "splits": ["train", "validation", "test"],
         "evaluation_split": ["validation"],
     },
@@ -202,7 +203,7 @@ def aggregate(self, y_input):
         "name": "hellaswag",
         "dataset": "tinyBenchmarks/tinyHellaswag",
         "subset": "default",
-        "prompt": "hellaswag_harness",
+        "prompt": prompt.hellaswag_harness,
         "splits": ["train", "validation", "test"],
         "evaluation_split": ["validation"],
     },
@@ -210,7 +211,7 @@ def aggregate(self, y_input):
         "name": "mmlu",
         "dataset": "tinyBenchmarks/tinyMMLU",
         "subset": "all",
-        "prompt": "mmlu_harness",
+        "prompt": prompt.mmlu_harness,
         "splits": ["validation", "dev", "test"],
         "evaluation_split": ["test"],
     },
@@ -218,7 +219,7 @@ def aggregate(self, y_input):
         "name": "truthfulqa",
         "dataset": "tinyBenchmarks/tinyTruthfulQA",
         "subset": "multiple_choice",
-        "prompt": "truthful_qa_multiple_choice",
+        "prompt": prompt.truthful_qa_multiple_choice,
         "splits": ["validation"],
         "evaluation_split": ["validation"],
     },
@@ -226,7 +227,7 @@ def aggregate(self, y_input):
         "name": "gsm8k",
         "dataset": "tinyBenchmarks/tinyGSM8k",
         "subset": "main",
-        "prompt": "gsm8k",
+        "prompt": prompt.gsm8k,
         "splits": ["train", "test"],
         "evaluation_split": ["test"],
     },
diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py
index fa1b1d5a..b92cb8fa 100644
--- a/src/lighteval/tasks/lighteval_task.py
+++ b/src/lighteval/tasks/lighteval_task.py
@@ -26,11 +26,10 @@
 from dataclasses import dataclass
 from multiprocessing import Pool
 from pathlib import Path
-from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union
 
 from datasets import load_dataset
 
-import lighteval.tasks.tasks_prompt_formatting as tasks_prompt_formatting
 from lighteval.few_shot_manager import FewShotSampler
 from lighteval.logging.hierarchical_logger import hlog, hlog_warn
 from lighteval.metrics import (
@@ -69,7 +68,7 @@ class LightevalTaskConfig:
     Arguments:
         name (str): Short name of the evaluation task.
         suite (list[str]): Evaluation suites to which the task belongs.
-        prompt_function (str): Name of the function used to create the [`Doc`] samples from each line of the evaluation dataset.
+        prompt_function (Callable[[dict, str], Doc]): Function used to create the [`Doc`] samples from each line of the evaluation dataset.
         hf_repo (str): Path of the hub dataset repository containing the evaluation information.
         hf_subset (str): Subset used for the current task, will be default if none is selected.
         hf_avail_splits (list[str]): All the available splits in the evaluation dataset
@@ -89,7 +88,7 @@ class LightevalTaskConfig:
     """
 
     name: str
-    prompt_function: str
+    prompt_function: Callable  # [[dict, str], Doc]
     hf_repo: str
     hf_subset: str
     metric: Tuple[Union[str, Metrics]]
@@ -203,31 +202,12 @@ def __init__(  # noqa: C901
         self.num_samples = [1] + [
             int(metric.replace("maj_at_", "").split("_")[0]) for metric in self.metrics if "maj_at_" in metric
         ]
+        if not isinstance(cfg.prompt_function, Callable):
+            raise TypeError(
+                f"Prompt formatting function ({str(cfg.prompt_function)}) should have been passed as a callable, was {type(cfg.prompt_function)} instead."
+            )
+
+        self.formatter = cfg.prompt_function
-
-        # Data processing
-        # to use once prompt formatting is managed as a module
-        if custom_tasks_module is None:
-            self.formatter = getattr(tasks_prompt_formatting, cfg.prompt_function)
-        else:
-            formatter = []
-            for module in custom_tasks_module:
-                if hasattr(module, cfg.prompt_function):
-                    formatter.append(getattr(module, cfg.prompt_function))
-
-            if len(formatter) == 0:  # Default version
-                self.formatter = getattr(tasks_prompt_formatting, cfg.prompt_function)
-            elif len(formatter) == 1:
-                # If we have a prompt in both the module and our tasks_prompt_formatting
-                # We take the prompt from the module
-                if hasattr(tasks_prompt_formatting, cfg.prompt_function):
-                    hlog_warn(
-                        f"Be careful you are using custom prompt function {cfg.prompt_function} and not the default one."
-                    )
-                self.formatter = formatter[0]
-            else:
-                raise Exception(
-                    f"You defined the prompt function {cfg.prompt_function} several times in the different custom modules you are loading."
-                )
         self.generation_size = cfg.generation_size
         self.stop_sequence = cfg.stop_sequence
         self.output_regex = cfg.output_regex
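
[Editor's note] Taken together, the hunks above replace the old string-based lookup (getattr on tasks_prompt_formatting or on a custom module) with a direct callable: a task module now defines its prompt function first and passes the function object to LightevalTaskConfig, and LightevalTask.__init__ raises a TypeError for anything that is not callable. As an illustration of the new interface only, not code from this patch, a custom task module would now read roughly like the sketch below; the dataset repository, column names, suite, and metric are hypothetical placeholders.

# Illustrative sketch of the callable-based interface; repo, columns, suite, and metric are hypothetical.
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc


def my_task_prompt(line, task_name: str = None):
    # Build one Doc per dataset row; the function object itself is what the config receives.
    return Doc(
        task_name=task_name,
        query=f"Question: {line['question']}\nAnswer:",  # 'question'/'answer' are assumed column names
        choices=[line["answer"]],
        gold_index=0,
        instruction="",
    )


my_task = LightevalTaskConfig(
    name="my_task",
    prompt_function=my_task_prompt,  # a callable, no longer a string; passing a string now raises TypeError at init
    suite=["community"],
    hf_repo="my-org/my-dataset",  # hypothetical dataset repository
    hf_subset="default",
    hf_avail_splits=["test"],
    evaluation_splits=["test"],
    metric=["exact_match"],  # placeholder metric name
    generation_size=256,
    stop_sequence=["\n"],
)

TASKS_TABLE = [my_task]

The same pattern is what the ifeval and mt_bench files above now follow: the prompt function is defined before the config that references it, so the module stays importable in one pass.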