From 9b3813ffc207273b3783b6c1505f9411fb45d612 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mentine=20Fourrier?= <22726840+clefourrier@users.noreply.github.com>
Date: Wed, 6 Mar 2024 13:17:06 +0100
Subject: [PATCH] Change the eos condition for GSM8K (#85)

We now provide both leaderboard|GSM8K, which reproduces the leaderboard's score, and lighteval|GSM8K.
---
 README.md | 2 +-
 run_evals_accelerate.py | 2 +-
 src/lighteval/logging/evaluation_tracker.py | 2 +-
 src/lighteval/tasks/registry.py | 9 +-
 src/lighteval/tasks/tasks_table.jsonl | 125 +++++++++---------
 tasks_examples/all_tasks.txt | 124 ++++++++---------
 tasks_examples/open_llm_leaderboard_tasks.txt | 124 ++++++++---------
 tasks_examples/recommended_set.txt | 122 ++++++++---------
 .../reference_scores/reference_task_scores.py | 65 +++++----
 tests/reference_scores/reference_tasks.py | 24 ++--
 10 files changed, 301 insertions(+), 298 deletions(-)

diff --git a/README.md b/README.md
index 5faafce3a..7cf415434 100644
--- a/README.md
+++ b/README.md
@@ -113,7 +113,7 @@ Here, `--override_batch_size` defines the _batch size per device_, so the effect
 ```shell
 accelerate launch --multi_gpu --num_processes=8 run_evals_accelerate.py \
     --model_args "pretrained=gpt2" \
-    --tasks "lighteval|truthfulqa:mc|0|0,lighteval|gsm8k|0|0" \
+    --tasks "leaderboard|truthfulqa:mc|0|0,leaderboard|gsm8k|0|0" \
     --override_batch_size 1 \
     --output_dir="./evals/"
 ```
diff --git a/run_evals_accelerate.py b/run_evals_accelerate.py
index ef0c473e5..57400f99b 100644
--- a/run_evals_accelerate.py
+++ b/run_evals_accelerate.py
@@ -22,7 +22,7 @@
 """
 Example run command:
 accelerate config
-accelerate launch run_evals_accelerate.py --tasks="lighteval|hellaswag|5|1" --output_dir "/scratch/evals" --model_args "pretrained=gpt2"
+accelerate launch run_evals_accelerate.py --tasks="leaderboard|hellaswag|5|1" --output_dir "/scratch/evals" --model_args "pretrained=gpt2"
 """
 
 import argparse
diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py
index 265ab6fbe..391226a50 100644
--- a/src/lighteval/logging/evaluation_tracker.py
+++ b/src/lighteval/logging/evaluation_tracker.py
@@ -312,7 +312,7 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None:
             # in the iso date, the `:` are replaced by `-` because windows does not allow `:` in their filenames
             task_name = os.path.basename(sub_file).replace("details_", "").split("_2023")[0].split("_2024")[0]
-            # task_name is then equal to `lighteval|mmlu:us_foreign_policy|5`
+            # task_name is then equal to `leaderboard|mmlu:us_foreign_policy|5`
             iso_date = os.path.dirname(sub_file)
             # to be able to parse the filename as iso dates, we need to re-replace the `-` with `:`
diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py
index f302c0d41..3b0c338ed 100644
--- a/src/lighteval/tasks/registry.py
+++ b/src/lighteval/tasks/registry.py
@@ -35,9 +35,12 @@
 from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig
 
 
-# original is the reimplementation of original evals
-# custom is to play around
-DEFAULT_SUITES = ["helm", "bigbench", "harness", "lighteval", "original", "custom", "community"]
+# Helm, Bigbench and Harness are implementations following the corresponding evaluation suite's setup
+# Original follows the original implementation as closely as possible
+# Leaderboard contains the evaluations we fixed on the Open LLM Leaderboard - you should get similar results
+# Community is for community-added evaluations
+# Custom is for all the experiments you might want to do!
+DEFAULT_SUITES = ["helm", "bigbench", "harness", "leaderboard", "lighteval", "original", "custom", "community"] TRUNCATE_FEW_SHOTS_DEFAULTS = True diff --git a/src/lighteval/tasks/tasks_table.jsonl b/src/lighteval/tasks/tasks_table.jsonl index 21c383fc3..98434c579 100644 --- a/src/lighteval/tasks/tasks_table.jsonl +++ b/src/lighteval/tasks/tasks_table.jsonl @@ -9,7 +9,7 @@ {"name":"arc:c:letters","suite":["original","arc"],"prompt_function":"arc_with_options_letters_predict","hf_repo":"ai2_arc","hf_subset":"ARC-Challenge","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"arc:c:options","suite":["original","arc"],"prompt_function":"arc_with_options","hf_repo":"ai2_arc","hf_subset":"ARC-Challenge","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"arc:c:simple","suite":["original","arc"],"prompt_function":"arc","hf_repo":"ai2_arc","hf_subset":"ARC-Challenge","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"arc:challenge","suite":["lighteval","arc"],"prompt_function":"arc","hf_repo":"ai2_arc","hf_subset":"ARC-Challenge","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":1,"metric":["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"arc:challenge","suite":["leaderboard","arc"],"prompt_function":"arc","hf_repo":"ai2_arc","hf_subset":"ARC-Challenge","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":1,"metric":["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"arc:easy","suite":["lighteval","arc"],"prompt_function":"arc","hf_repo":"ai2_arc","hf_subset":"ARC-Easy","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"arithmetic:1dc","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_1dc","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
{"name":"arithmetic:2da","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_2da","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} @@ -361,10 +361,11 @@ {"name":"goal_step_wikihow","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"goal_step_wikihow","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"gpqa","suite":["lighteval"],"prompt_function":"gpqa","hf_repo":"Idavidrein/gpqa","hf_subset":"gpqa_main","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"gre_reading_comprehension","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"gre_reading_comprehension","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"gsm8k","suite":["lighteval"],"prompt_function":"gsm8k","hf_repo":"gsm8k","hf_subset":"main","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":256,"metric":["quasi_exact_match_gsm8k"],"stop_sequence":[":","Question:", "Question"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"gsm8k","suite":["leaderboard"],"prompt_function":"gsm8k","hf_repo":"gsm8k","hf_subset":"main","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":256,"metric":["quasi_exact_match_gsm8k"],"stop_sequence":["Question:","Question",":"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"gsm8k","suite":["lighteval"],"prompt_function":"gsm8k","hf_repo":"gsm8k","hf_subset":"main","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":256,"metric":["quasi_exact_match_gsm8k"],"stop_sequence":["Question:"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"headqa:en","suite":["lighteval","headqa"],"prompt_function":"headqa","hf_repo":"lighteval/headqa_harness","hf_subset":"en","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
{"name":"headqa:es","suite":["lighteval","headqa"],"prompt_function":"headqa","hf_repo":"lighteval/headqa_harness","hf_subset":"es","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"hellaswag","suite":["lighteval"],"prompt_function":"hellaswag_harness","hf_repo":"hellaswag","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"hellaswag","suite":["leaderboard"],"prompt_function":"hellaswag_harness","hf_repo":"hellaswag","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"hellaswag","suite":["helm","helm_general"],"prompt_function":"hellaswag_helm","hf_repo":"hellaswag","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"hhh_alignment","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"hhh_alignment","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"hindi_question_answering","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"hindi_question_answering","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} @@ -494,175 +495,175 @@ {"name":"mmlu","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"all","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu","suite":["original"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"all","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":5,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
{"name":"mmlu:abstract_algebra","suite":["original","mmlu"],"prompt_function":"mmlu_abstract_algebra","hf_repo":"cais\/mmlu","hf_subset":"abstract_algebra","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:abstract_algebra","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"abstract_algebra","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:abstract_algebra","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"abstract_algebra","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:abstract_algebra","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"abstract_algebra","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:anatomy","suite":["original","mmlu"],"prompt_function":"mmlu_anatomy","hf_repo":"cais\/mmlu","hf_subset":"anatomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:anatomy","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"anatomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:anatomy","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"anatomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:anatomy","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"anatomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
{"name":"mmlu:astronomy","suite":["original","mmlu"],"prompt_function":"mmlu_astronomy","hf_repo":"cais\/mmlu","hf_subset":"astronomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:astronomy","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"astronomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:astronomy","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"astronomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:astronomy","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"astronomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:business_ethics","suite":["original","mmlu"],"prompt_function":"mmlu_business_ethics","hf_repo":"cais\/mmlu","hf_subset":"business_ethics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:business_ethics","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"business_ethics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:business_ethics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"business_ethics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:business_ethics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"business_ethics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": 
true} {"name":"mmlu:clinical_knowledge","suite":["original","mmlu"],"prompt_function":"mmlu_clinical_knowledge","hf_repo":"cais\/mmlu","hf_subset":"clinical_knowledge","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:clinical_knowledge","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"clinical_knowledge","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:clinical_knowledge","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"clinical_knowledge","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:clinical_knowledge","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"clinical_knowledge","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:college_biology","suite":["original","mmlu"],"prompt_function":"mmlu_college_biology","hf_repo":"cais\/mmlu","hf_subset":"college_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:college_biology","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:college_biology","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
{"name":"mmlu:college_biology","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"college_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:college_chemistry","suite":["original","mmlu"],"prompt_function":"mmlu_college_chemistry","hf_repo":"cais\/mmlu","hf_subset":"college_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:college_chemistry","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:college_chemistry","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:college_chemistry","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"college_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:college_computer_science","suite":["original","mmlu"],"prompt_function":"mmlu_college_computer_science","hf_repo":"cais\/mmlu","hf_subset":"college_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:college_computer_science","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
+{"name":"mmlu:college_computer_science","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:college_computer_science","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"college_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:college_mathematics","suite":["original","mmlu"],"prompt_function":"mmlu_college_mathematics","hf_repo":"cais\/mmlu","hf_subset":"college_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:college_mathematics","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:college_mathematics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:college_mathematics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"college_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:college_medicine","suite":["original","mmlu"],"prompt_function":"mmlu_college_medicine","hf_repo":"cais\/mmlu","hf_subset":"college_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mmlu:college_medicine","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:college_medicine","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:college_medicine","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"college_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:college_physics","suite":["original","mmlu"],"prompt_function":"mmlu_college_physics","hf_repo":"cais\/mmlu","hf_subset":"college_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:college_physics","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:college_physics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:college_physics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"college_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
{"name":"mmlu:computer_security","suite":["original","mmlu"],"prompt_function":"mmlu_computer_security","hf_repo":"cais\/mmlu","hf_subset":"computer_security","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:computer_security","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"computer_security","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:computer_security","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"computer_security","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:computer_security","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"computer_security","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:conceptual_physics","suite":["original","mmlu"],"prompt_function":"mmlu_conceptual_physics","hf_repo":"cais\/mmlu","hf_subset":"conceptual_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:conceptual_physics","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"conceptual_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:conceptual_physics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"conceptual_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
{"name":"mmlu:conceptual_physics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"conceptual_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:econometrics","suite":["original","mmlu"],"prompt_function":"mmlu_econometrics","hf_repo":"cais\/mmlu","hf_subset":"econometrics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:econometrics","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"econometrics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:econometrics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"econometrics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:econometrics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"econometrics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:electrical_engineering","suite":["original","mmlu"],"prompt_function":"mmlu_electrical_engineering","hf_repo":"cais\/mmlu","hf_subset":"electrical_engineering","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:electrical_engineering","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"electrical_engineering","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
+{"name":"mmlu:electrical_engineering","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"electrical_engineering","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:electrical_engineering","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"electrical_engineering","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:elementary_mathematics","suite":["original","mmlu"],"prompt_function":"mmlu_elementary_mathematics","hf_repo":"cais\/mmlu","hf_subset":"elementary_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:elementary_mathematics","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"elementary_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:elementary_mathematics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"elementary_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:elementary_mathematics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"elementary_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:formal_logic","suite":["original","mmlu"],"prompt_function":"mmlu_formal_logic","hf_repo":"cais\/mmlu","hf_subset":"formal_logic","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mmlu:formal_logic","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"formal_logic","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:formal_logic","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"formal_logic","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:formal_logic","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"formal_logic","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:global_facts","suite":["original","mmlu"],"prompt_function":"mmlu_global_facts","hf_repo":"cais\/mmlu","hf_subset":"global_facts","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:global_facts","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"global_facts","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:global_facts","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"global_facts","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:global_facts","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"global_facts","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:high_school_biology","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_biology","hf_repo":"cais\/mmlu","hf_subset":"high_school_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, 
"trust_dataset": true} -{"name":"mmlu:high_school_biology","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:high_school_biology","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:high_school_biology","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:high_school_chemistry","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_chemistry","hf_repo":"cais\/mmlu","hf_subset":"high_school_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_chemistry","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:high_school_chemistry","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:high_school_chemistry","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
{"name":"mmlu:high_school_computer_science","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_computer_science","hf_repo":"cais\/mmlu","hf_subset":"high_school_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_computer_science","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:high_school_computer_science","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:high_school_computer_science","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:high_school_european_history","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_european_history","hf_repo":"cais\/mmlu","hf_subset":"high_school_european_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_european_history","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_european_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:high_school_european_history","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_european_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
{"name":"mmlu:high_school_european_history","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_european_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:high_school_geography","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_geography","hf_repo":"cais\/mmlu","hf_subset":"high_school_geography","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_geography","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_geography","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:high_school_geography","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_geography","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:high_school_geography","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_geography","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:high_school_government_and_politics","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_government_and_politics","hf_repo":"cais\/mmlu","hf_subset":"high_school_government_and_politics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_government_and_politics","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_government_and_politics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
+{"name":"mmlu:high_school_government_and_politics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_government_and_politics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:high_school_government_and_politics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_government_and_politics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:high_school_macroeconomics","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_macroeconomics","hf_repo":"cais\/mmlu","hf_subset":"high_school_macroeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_macroeconomics","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_macroeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:high_school_macroeconomics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_macroeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:high_school_macroeconomics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_macroeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:high_school_mathematics","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_mathematics","hf_repo":"cais\/mmlu","hf_subset":"high_school_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mmlu:high_school_mathematics","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:high_school_mathematics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:high_school_mathematics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:high_school_microeconomics","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_microeconomics","hf_repo":"cais\/mmlu","hf_subset":"high_school_microeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_microeconomics","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_microeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:high_school_microeconomics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_microeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:high_school_microeconomics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_microeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
{"name":"mmlu:high_school_physics","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_physics","hf_repo":"cais\/mmlu","hf_subset":"high_school_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_physics","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:high_school_physics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:high_school_physics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:high_school_psychology","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_psychology","hf_repo":"cais\/mmlu","hf_subset":"high_school_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_psychology","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:high_school_psychology","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
{"name":"mmlu:high_school_psychology","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:high_school_statistics","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_statistics","hf_repo":"cais\/mmlu","hf_subset":"high_school_statistics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_statistics","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_statistics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:high_school_statistics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_statistics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:high_school_statistics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_statistics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:high_school_us_history","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_us_history","hf_repo":"cais\/mmlu","hf_subset":"high_school_us_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_us_history","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_us_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
+{"name":"mmlu:high_school_us_history","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_us_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:high_school_us_history","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_us_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:high_school_world_history","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_world_history","hf_repo":"cais\/mmlu","hf_subset":"high_school_world_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:high_school_world_history","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_world_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:high_school_world_history","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_world_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:high_school_world_history","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_world_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:human_aging","suite":["original","mmlu"],"prompt_function":"mmlu_human_aging","hf_repo":"cais\/mmlu","hf_subset":"human_aging","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mmlu:human_aging","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"human_aging","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:human_aging","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"human_aging","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:human_aging","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"human_aging","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:human_sexuality","suite":["original","mmlu"],"prompt_function":"mmlu_human_sexuality","hf_repo":"cais\/mmlu","hf_subset":"human_sexuality","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:human_sexuality","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"human_sexuality","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:human_sexuality","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"human_sexuality","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:human_sexuality","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"human_sexuality","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
{"name":"mmlu:international_law","suite":["original","mmlu"],"prompt_function":"mmlu_international_law","hf_repo":"cais\/mmlu","hf_subset":"international_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:international_law","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"international_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:international_law","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"international_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:international_law","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"international_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:jurisprudence","suite":["original","mmlu"],"prompt_function":"mmlu_jurisprudence","hf_repo":"cais\/mmlu","hf_subset":"jurisprudence","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:jurisprudence","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"jurisprudence","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:jurisprudence","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"jurisprudence","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
{"name":"mmlu:jurisprudence","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"jurisprudence","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:logical_fallacies","suite":["original","mmlu"],"prompt_function":"mmlu_logical_fallacies","hf_repo":"cais\/mmlu","hf_subset":"logical_fallacies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:logical_fallacies","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"logical_fallacies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:logical_fallacies","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"logical_fallacies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:logical_fallacies","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"logical_fallacies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:machine_learning","suite":["original","mmlu"],"prompt_function":"mmlu_machine_learning","hf_repo":"cais\/mmlu","hf_subset":"machine_learning","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:machine_learning","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"machine_learning","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
+{"name":"mmlu:machine_learning","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"machine_learning","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:machine_learning","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"machine_learning","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:management","suite":["original","mmlu"],"prompt_function":"mmlu_management","hf_repo":"cais\/mmlu","hf_subset":"management","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:management","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"management","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:management","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"management","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:management","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"management","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:marketing","suite":["original","mmlu"],"prompt_function":"mmlu_marketing","hf_repo":"cais\/mmlu","hf_subset":"marketing","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:marketing","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"marketing","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
+{"name":"mmlu:marketing","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"marketing","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:marketing","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"marketing","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:medical_genetics","suite":["original","mmlu"],"prompt_function":"mmlu_medical_genetics","hf_repo":"cais\/mmlu","hf_subset":"medical_genetics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:medical_genetics","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"medical_genetics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:medical_genetics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"medical_genetics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:medical_genetics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"medical_genetics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:miscellaneous","suite":["original","mmlu"],"prompt_function":"mmlu_miscellaneous","hf_repo":"cais\/mmlu","hf_subset":"miscellaneous","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mmlu:miscellaneous","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"miscellaneous","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:miscellaneous","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"miscellaneous","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:miscellaneous","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"miscellaneous","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:moral_disputes","suite":["original","mmlu"],"prompt_function":"mmlu_moral_disputes","hf_repo":"cais\/mmlu","hf_subset":"moral_disputes","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:moral_disputes","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"moral_disputes","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:moral_disputes","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"moral_disputes","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:moral_disputes","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"moral_disputes","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
{"name":"mmlu:moral_scenarios","suite":["original","mmlu"],"prompt_function":"mmlu_moral_scenarios","hf_repo":"cais\/mmlu","hf_subset":"moral_scenarios","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:moral_scenarios","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"moral_scenarios","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:moral_scenarios","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"moral_scenarios","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:moral_scenarios","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"moral_scenarios","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:nutrition","suite":["original","mmlu"],"prompt_function":"mmlu_nutrition","hf_repo":"cais\/mmlu","hf_subset":"nutrition","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:nutrition","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"nutrition","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:nutrition","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"nutrition","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:nutrition","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"nutrition","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": 
true} {"name":"mmlu:philosophy","suite":["original","mmlu"],"prompt_function":"mmlu_philosophy","hf_repo":"cais\/mmlu","hf_subset":"philosophy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:philosophy","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"philosophy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:philosophy","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"philosophy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:philosophy","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"philosophy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:prehistory","suite":["original","mmlu"],"prompt_function":"mmlu_prehistory","hf_repo":"cais\/mmlu","hf_subset":"prehistory","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:prehistory","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"prehistory","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:prehistory","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"prehistory","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:prehistory","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"prehistory","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
{"name":"mmlu:professional_accounting","suite":["original","mmlu"],"prompt_function":"mmlu_professional_accounting","hf_repo":"cais\/mmlu","hf_subset":"professional_accounting","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:professional_accounting","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"professional_accounting","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:professional_accounting","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"professional_accounting","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:professional_accounting","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"professional_accounting","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:professional_law","suite":["original","mmlu"],"prompt_function":"mmlu_professional_law","hf_repo":"cais\/mmlu","hf_subset":"professional_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:professional_law","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"professional_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:professional_law","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"professional_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
{"name":"mmlu:professional_law","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"professional_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:professional_medicine","suite":["original","mmlu"],"prompt_function":"mmlu_professional_medicine","hf_repo":"cais\/mmlu","hf_subset":"professional_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:professional_medicine","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"professional_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:professional_medicine","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"professional_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:professional_medicine","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"professional_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:professional_psychology","suite":["original","mmlu"],"prompt_function":"mmlu_professional_psychology","hf_repo":"cais\/mmlu","hf_subset":"professional_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:professional_psychology","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"professional_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
+{"name":"mmlu:professional_psychology","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"professional_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:professional_psychology","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"professional_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:public_relations","suite":["original","mmlu"],"prompt_function":"mmlu_public_relations","hf_repo":"cais\/mmlu","hf_subset":"public_relations","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:public_relations","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"public_relations","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:public_relations","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"public_relations","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:public_relations","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"public_relations","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:security_studies","suite":["original","mmlu"],"prompt_function":"mmlu_security_studies","hf_repo":"cais\/mmlu","hf_subset":"security_studies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
-{"name":"mmlu:security_studies","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"security_studies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:security_studies","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"security_studies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:security_studies","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"security_studies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:sociology","suite":["original","mmlu"],"prompt_function":"mmlu_sociology","hf_repo":"cais\/mmlu","hf_subset":"sociology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:sociology","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"sociology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:sociology","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"sociology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:sociology","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"sociology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:us_foreign_policy","suite":["original","mmlu"],"prompt_function":"mmlu_us_foreign_policy","hf_repo":"cais\/mmlu","hf_subset":"us_foreign_policy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, 
"trust_dataset": true} -{"name":"mmlu:us_foreign_policy","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"us_foreign_policy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:us_foreign_policy","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"us_foreign_policy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:us_foreign_policy","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"us_foreign_policy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:virology","suite":["original","mmlu"],"prompt_function":"mmlu_virology","hf_repo":"cais\/mmlu","hf_subset":"virology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:virology","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"virology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:virology","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"virology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:virology","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"virology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
{"name":"mmlu:world_religions","suite":["original","mmlu"],"prompt_function":"mmlu_world_religions","hf_repo":"cais\/mmlu","hf_subset":"world_religions","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"mmlu:world_religions","suite":["lighteval","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"world_religions","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"mmlu:world_religions","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"world_religions","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mmlu:world_religions","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"world_religions","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"mnist_ascii","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"mnist_ascii","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"modified_arithmetic","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"modified_arithmetic","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} @@ -835,7 +836,7 @@ {"name":"tracking_shuffled_objects","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"tracking_shuffled_objects","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"triviaqa","suite":["lighteval"],"prompt_function":"triviaqa","hf_repo":"trivia_qa","hf_subset":"rc.nocontext","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["quasi_exact_match_triviaqa"],"stop_sequence":["\n", ".", ","],"output_regex":null,"frozen":false, "trust_dataset": true} 
{"name":"truthfulqa:gen","suite":["lighteval"],"prompt_function":"truthful_qa_generative","hf_repo":"truthful_qa","hf_subset":"generation","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":200,"metric":["bleu","rouge_t5"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"truthfulqa:mc","suite":["lighteval"],"prompt_function":"truthful_qa_multiple_choice","hf_repo":"truthful_qa","hf_subset":"multiple_choice","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["truthfulqa_mc_metrics"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"truthfulqa:mc","suite":["leaderboard"],"prompt_function":"truthful_qa_multiple_choice","hf_repo":"truthful_qa","hf_subset":"multiple_choice","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["truthfulqa_mc_metrics"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"truthfulqa","suite":["helm","helm_general"],"prompt_function":"truthful_qa_helm","hf_repo":"lighteval\/truthfulqa_helm","hf_subset":"default","hf_avail_splits":["train","valid"],"evaluation_splits":["valid"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["loglikelihood_acc","exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"twitterAAE:aa","suite":["helm"],"prompt_function":"twitter_aae","hf_repo":"lighteval\/twitterAAE","hf_subset":"aa","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"twitterAAE:white","suite":["helm"],"prompt_function":"twitter_aae","hf_repo":"lighteval\/twitterAAE","hf_subset":"white","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} @@ -943,7 +944,7 @@ {"name":"wikitext:103:document_level","suite":["harness"],"prompt_function":"wikitext_harness","hf_repo":"EleutherAI\/wikitext_document_level","hf_subset":"wikitext-103-raw-v1","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"wikitext:103:document_level","suite":["helm"],"prompt_function":"wikitext_helm","hf_repo":"EleutherAI\/wikitext_document_level","hf_subset":"wikitext-103-raw-v1","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} 
{"name":"wino_x_german","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"wino_x_german","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} -{"name":"winogrande","suite":["lighteval"],"prompt_function":"winogrande","hf_repo":"winogrande","hf_subset":"winogrande_xl","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} +{"name":"winogrande","suite":["leaderboard"],"prompt_function":"winogrande","hf_repo":"winogrande","hf_subset":"winogrande_xl","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"winowhy","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"winowhy","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"wmt08:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} {"name":"wmt08:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true} diff --git a/tasks_examples/all_tasks.txt b/tasks_examples/all_tasks.txt index 69329daab..894ffc6f9 100644 --- a/tasks_examples/all_tasks.txt +++ b/tasks_examples/all_tasks.txt @@ -628,7 +628,7 @@ lighteval|anli:r1|0|0 lighteval|anli:r2|0|0 lighteval|anli:r3|0|0 lighteval|anli|0|0 -lighteval|arc:challenge|0|0 +leaderboard|arc:challenge|0|0 lighteval|arc:easy|0|0 lighteval|arithmetic:1dc|0|0 lighteval|arithmetic:2da|0|0 @@ -726,10 +726,10 @@ lighteval|glue:rte|0|0 lighteval|glue:sst2|0|0 lighteval|glue:stsb|0|0 lighteval|glue:wnli|0|0 -lighteval|gsm8k|0|0 +leaderboard|gsm8k|0|0 lighteval|headqa:en|0|0 lighteval|headqa:es|0|0 -lighteval|hellaswag|0|0 +leaderboard|hellaswag|0|0 lighteval|iwslt17:ar-en|0|0 lighteval|iwslt17:de-en|0|0 lighteval|iwslt17:en-ar|0|0 @@ -771,63 +771,63 @@ lighteval|mgsm:sw|0|0 lighteval|mgsm:te|0|0 lighteval|mgsm:th|0|0 lighteval|mgsm:zh|0|0 -lighteval|mmlu:abstract_algebra|0|0 -lighteval|mmlu:anatomy|0|0 -lighteval|mmlu:astronomy|0|0 -lighteval|mmlu:business_ethics|0|0 -lighteval|mmlu:clinical_knowledge|0|0 -lighteval|mmlu:college_biology|0|0 
-lighteval|mmlu:college_chemistry|0|0 -lighteval|mmlu:college_computer_science|0|0 -lighteval|mmlu:college_mathematics|0|0 -lighteval|mmlu:college_medicine|0|0 -lighteval|mmlu:college_physics|0|0 -lighteval|mmlu:computer_security|0|0 -lighteval|mmlu:conceptual_physics|0|0 -lighteval|mmlu:econometrics|0|0 -lighteval|mmlu:electrical_engineering|0|0 -lighteval|mmlu:elementary_mathematics|0|0 -lighteval|mmlu:formal_logic|0|0 -lighteval|mmlu:global_facts|0|0 -lighteval|mmlu:high_school_biology|0|0 -lighteval|mmlu:high_school_chemistry|0|0 -lighteval|mmlu:high_school_computer_science|0|0 -lighteval|mmlu:high_school_european_history|0|0 -lighteval|mmlu:high_school_geography|0|0 -lighteval|mmlu:high_school_government_and_politics|0|0 -lighteval|mmlu:high_school_macroeconomics|0|0 -lighteval|mmlu:high_school_mathematics|0|0 -lighteval|mmlu:high_school_microeconomics|0|0 -lighteval|mmlu:high_school_physics|0|0 -lighteval|mmlu:high_school_psychology|0|0 -lighteval|mmlu:high_school_statistics|0|0 -lighteval|mmlu:high_school_us_history|0|0 -lighteval|mmlu:high_school_world_history|0|0 -lighteval|mmlu:human_aging|0|0 -lighteval|mmlu:human_sexuality|0|0 -lighteval|mmlu:international_law|0|0 -lighteval|mmlu:jurisprudence|0|0 -lighteval|mmlu:logical_fallacies|0|0 -lighteval|mmlu:machine_learning|0|0 -lighteval|mmlu:management|0|0 -lighteval|mmlu:marketing|0|0 -lighteval|mmlu:medical_genetics|0|0 -lighteval|mmlu:miscellaneous|0|0 -lighteval|mmlu:moral_disputes|0|0 -lighteval|mmlu:moral_scenarios|0|0 -lighteval|mmlu:nutrition|0|0 -lighteval|mmlu:philosophy|0|0 -lighteval|mmlu:prehistory|0|0 -lighteval|mmlu:professional_accounting|0|0 -lighteval|mmlu:professional_law|0|0 -lighteval|mmlu:professional_medicine|0|0 -lighteval|mmlu:professional_psychology|0|0 -lighteval|mmlu:public_relations|0|0 -lighteval|mmlu:security_studies|0|0 -lighteval|mmlu:sociology|0|0 -lighteval|mmlu:us_foreign_policy|0|0 -lighteval|mmlu:virology|0|0 -lighteval|mmlu:world_religions|0|0 +leaderboard|mmlu:abstract_algebra|0|0 +leaderboard|mmlu:anatomy|0|0 +leaderboard|mmlu:astronomy|0|0 +leaderboard|mmlu:business_ethics|0|0 +leaderboard|mmlu:clinical_knowledge|0|0 +leaderboard|mmlu:college_biology|0|0 +leaderboard|mmlu:college_chemistry|0|0 +leaderboard|mmlu:college_computer_science|0|0 +leaderboard|mmlu:college_mathematics|0|0 +leaderboard|mmlu:college_medicine|0|0 +leaderboard|mmlu:college_physics|0|0 +leaderboard|mmlu:computer_security|0|0 +leaderboard|mmlu:conceptual_physics|0|0 +leaderboard|mmlu:econometrics|0|0 +leaderboard|mmlu:electrical_engineering|0|0 +leaderboard|mmlu:elementary_mathematics|0|0 +leaderboard|mmlu:formal_logic|0|0 +leaderboard|mmlu:global_facts|0|0 +leaderboard|mmlu:high_school_biology|0|0 +leaderboard|mmlu:high_school_chemistry|0|0 +leaderboard|mmlu:high_school_computer_science|0|0 +leaderboard|mmlu:high_school_european_history|0|0 +leaderboard|mmlu:high_school_geography|0|0 +leaderboard|mmlu:high_school_government_and_politics|0|0 +leaderboard|mmlu:high_school_macroeconomics|0|0 +leaderboard|mmlu:high_school_mathematics|0|0 +leaderboard|mmlu:high_school_microeconomics|0|0 +leaderboard|mmlu:high_school_physics|0|0 +leaderboard|mmlu:high_school_psychology|0|0 +leaderboard|mmlu:high_school_statistics|0|0 +leaderboard|mmlu:high_school_us_history|0|0 +leaderboard|mmlu:high_school_world_history|0|0 +leaderboard|mmlu:human_aging|0|0 +leaderboard|mmlu:human_sexuality|0|0 +leaderboard|mmlu:international_law|0|0 +leaderboard|mmlu:jurisprudence|0|0 +leaderboard|mmlu:logical_fallacies|0|0 
+leaderboard|mmlu:machine_learning|0|0 +leaderboard|mmlu:management|0|0 +leaderboard|mmlu:marketing|0|0 +leaderboard|mmlu:medical_genetics|0|0 +leaderboard|mmlu:miscellaneous|0|0 +leaderboard|mmlu:moral_disputes|0|0 +leaderboard|mmlu:moral_scenarios|0|0 +leaderboard|mmlu:nutrition|0|0 +leaderboard|mmlu:philosophy|0|0 +leaderboard|mmlu:prehistory|0|0 +leaderboard|mmlu:professional_accounting|0|0 +leaderboard|mmlu:professional_law|0|0 +leaderboard|mmlu:professional_medicine|0|0 +leaderboard|mmlu:professional_psychology|0|0 +leaderboard|mmlu:public_relations|0|0 +leaderboard|mmlu:security_studies|0|0 +leaderboard|mmlu:sociology|0|0 +leaderboard|mmlu:us_foreign_policy|0|0 +leaderboard|mmlu:virology|0|0 +leaderboard|mmlu:world_religions|0|0 lighteval|mtnt2019:en-fr|0|0 lighteval|mtnt2019:en-ja|0|0 lighteval|mtnt2019:fr-en|0|0 @@ -881,7 +881,7 @@ lighteval|the_pile:youtubesubtitles|0|0 lighteval|toxigen|0|0 lighteval|triviaqa|0|0 lighteval|truthfulqa:gen|0|0 -lighteval|truthfulqa:mc|0|0 +leaderboard|truthfulqa:mc|0|0 lighteval|unscramble:anagrams1|0|0 lighteval|unscramble:anagrams2|0|0 lighteval|unscramble:cycle_letters|0|0 @@ -889,7 +889,7 @@ lighteval|unscramble:random_insertion|0|0 lighteval|unscramble:reversed_words|0|0 lighteval|webqs|0|0 lighteval|wikitext|0|0 -lighteval|winogrande|0|0 +leaderboard|winogrande|0|0 lighteval|wmt08:cs-en|0|0 lighteval|wmt08:de-en|0|0 lighteval|wmt08:en-cs|0|0 diff --git a/tasks_examples/open_llm_leaderboard_tasks.txt b/tasks_examples/open_llm_leaderboard_tasks.txt index 451d87c45..51de4f473 100644 --- a/tasks_examples/open_llm_leaderboard_tasks.txt +++ b/tasks_examples/open_llm_leaderboard_tasks.txt @@ -1,68 +1,68 @@ # ARC -lighteval|arc:challenge|25|0 +leaderboard|arc:challenge|25|0 # HellaSwag -lighteval|hellaswag|10|0 +leaderboard|hellaswag|10|0 # TruthfulQA -lighteval|truthfulqa:mc|0|0 +leaderboard|truthfulqa:mc|0|0 # MMLU -lighteval|mmlu:abstract_algebra|5|0 -lighteval|mmlu:anatomy|5|0 -lighteval|mmlu:astronomy|5|0 -lighteval|mmlu:business_ethics|5|0 -lighteval|mmlu:clinical_knowledge|5|0 -lighteval|mmlu:college_biology|5|0 -lighteval|mmlu:college_chemistry|5|0 -lighteval|mmlu:college_computer_science|5|0 -lighteval|mmlu:college_mathematics|5|0 -lighteval|mmlu:college_medicine|5|0 -lighteval|mmlu:college_physics|5|0 -lighteval|mmlu:computer_security|5|0 -lighteval|mmlu:conceptual_physics|5|0 -lighteval|mmlu:econometrics|5|0 -lighteval|mmlu:electrical_engineering|5|0 -lighteval|mmlu:elementary_mathematics|5|0 -lighteval|mmlu:formal_logic|5|0 -lighteval|mmlu:global_facts|5|0 -lighteval|mmlu:high_school_biology|5|0 -lighteval|mmlu:high_school_chemistry|5|0 -lighteval|mmlu:high_school_computer_science|5|0 -lighteval|mmlu:high_school_european_history|5|0 -lighteval|mmlu:high_school_geography|5|0 -lighteval|mmlu:high_school_government_and_politics|5|0 -lighteval|mmlu:high_school_macroeconomics|5|0 -lighteval|mmlu:high_school_mathematics|5|0 -lighteval|mmlu:high_school_microeconomics|5|0 -lighteval|mmlu:high_school_physics|5|0 -lighteval|mmlu:high_school_psychology|5|0 -lighteval|mmlu:high_school_statistics|5|0 -lighteval|mmlu:high_school_us_history|5|0 -lighteval|mmlu:high_school_world_history|5|0 -lighteval|mmlu:human_aging|5|0 -lighteval|mmlu:human_sexuality|5|0 -lighteval|mmlu:international_law|5|0 -lighteval|mmlu:jurisprudence|5|0 -lighteval|mmlu:logical_fallacies|5|0 -lighteval|mmlu:machine_learning|5|0 -lighteval|mmlu:management|5|0 -lighteval|mmlu:marketing|5|0 -lighteval|mmlu:medical_genetics|5|0 -lighteval|mmlu:miscellaneous|5|0 
-lighteval|mmlu:moral_disputes|5|0 -lighteval|mmlu:moral_scenarios|5|0 -lighteval|mmlu:nutrition|5|0 -lighteval|mmlu:philosophy|5|0 -lighteval|mmlu:prehistory|5|0 -lighteval|mmlu:professional_accounting|5|0 -lighteval|mmlu:professional_law|5|0 -lighteval|mmlu:professional_medicine|5|0 -lighteval|mmlu:professional_psychology|5|0 -lighteval|mmlu:public_relations|5|0 -lighteval|mmlu:security_studies|5|0 -lighteval|mmlu:sociology|5|0 -lighteval|mmlu:us_foreign_policy|5|0 -lighteval|mmlu:virology|5|0 -lighteval|mmlu:world_religions|5|0 +leaderboard|mmlu:abstract_algebra|5|0 +leaderboard|mmlu:anatomy|5|0 +leaderboard|mmlu:astronomy|5|0 +leaderboard|mmlu:business_ethics|5|0 +leaderboard|mmlu:clinical_knowledge|5|0 +leaderboard|mmlu:college_biology|5|0 +leaderboard|mmlu:college_chemistry|5|0 +leaderboard|mmlu:college_computer_science|5|0 +leaderboard|mmlu:college_mathematics|5|0 +leaderboard|mmlu:college_medicine|5|0 +leaderboard|mmlu:college_physics|5|0 +leaderboard|mmlu:computer_security|5|0 +leaderboard|mmlu:conceptual_physics|5|0 +leaderboard|mmlu:econometrics|5|0 +leaderboard|mmlu:electrical_engineering|5|0 +leaderboard|mmlu:elementary_mathematics|5|0 +leaderboard|mmlu:formal_logic|5|0 +leaderboard|mmlu:global_facts|5|0 +leaderboard|mmlu:high_school_biology|5|0 +leaderboard|mmlu:high_school_chemistry|5|0 +leaderboard|mmlu:high_school_computer_science|5|0 +leaderboard|mmlu:high_school_european_history|5|0 +leaderboard|mmlu:high_school_geography|5|0 +leaderboard|mmlu:high_school_government_and_politics|5|0 +leaderboard|mmlu:high_school_macroeconomics|5|0 +leaderboard|mmlu:high_school_mathematics|5|0 +leaderboard|mmlu:high_school_microeconomics|5|0 +leaderboard|mmlu:high_school_physics|5|0 +leaderboard|mmlu:high_school_psychology|5|0 +leaderboard|mmlu:high_school_statistics|5|0 +leaderboard|mmlu:high_school_us_history|5|0 +leaderboard|mmlu:high_school_world_history|5|0 +leaderboard|mmlu:human_aging|5|0 +leaderboard|mmlu:human_sexuality|5|0 +leaderboard|mmlu:international_law|5|0 +leaderboard|mmlu:jurisprudence|5|0 +leaderboard|mmlu:logical_fallacies|5|0 +leaderboard|mmlu:machine_learning|5|0 +leaderboard|mmlu:management|5|0 +leaderboard|mmlu:marketing|5|0 +leaderboard|mmlu:medical_genetics|5|0 +leaderboard|mmlu:miscellaneous|5|0 +leaderboard|mmlu:moral_disputes|5|0 +leaderboard|mmlu:moral_scenarios|5|0 +leaderboard|mmlu:nutrition|5|0 +leaderboard|mmlu:philosophy|5|0 +leaderboard|mmlu:prehistory|5|0 +leaderboard|mmlu:professional_accounting|5|0 +leaderboard|mmlu:professional_law|5|0 +leaderboard|mmlu:professional_medicine|5|0 +leaderboard|mmlu:professional_psychology|5|0 +leaderboard|mmlu:public_relations|5|0 +leaderboard|mmlu:security_studies|5|0 +leaderboard|mmlu:sociology|5|0 +leaderboard|mmlu:us_foreign_policy|5|0 +leaderboard|mmlu:virology|5|0 +leaderboard|mmlu:world_religions|5|0 # WinoGrande -lighteval|winogrande|5|0 +leaderboard|winogrande|5|0 # GSM8K -lighteval|gsm8k|5|0 \ No newline at end of file +leaderboard|gsm8k|5|0 diff --git a/tasks_examples/recommended_set.txt b/tasks_examples/recommended_set.txt index b2a9d7bca..d1904e3cc 100644 --- a/tasks_examples/recommended_set.txt +++ b/tasks_examples/recommended_set.txt @@ -6,63 +6,63 @@ lighteval|ethics:justice|0|0 lighteval|ethics:utilitarianism|0|0 lighteval|ethics:virtue|0|0 # MMLU -lighteval|mmlu:abstract_algebra|0|0 -lighteval|mmlu:anatomy|0|0 -lighteval|mmlu:astronomy|0|0 -lighteval|mmlu:business_ethics|0|0 -lighteval|mmlu:clinical_knowledge|0|0 -lighteval|mmlu:college_biology|0|0 -lighteval|mmlu:college_chemistry|0|0 
-lighteval|mmlu:college_computer_science|0|0 -lighteval|mmlu:college_mathematics|0|0 -lighteval|mmlu:college_medicine|0|0 -lighteval|mmlu:college_physics|0|0 -lighteval|mmlu:computer_security|0|0 -lighteval|mmlu:conceptual_physics|0|0 -lighteval|mmlu:econometrics|0|0 -lighteval|mmlu:electrical_engineering|0|0 -lighteval|mmlu:elementary_mathematics|0|0 -lighteval|mmlu:formal_logic|0|0 -lighteval|mmlu:global_facts|0|0 -lighteval|mmlu:high_school_biology|0|0 -lighteval|mmlu:high_school_chemistry|0|0 -lighteval|mmlu:high_school_computer_science|0|0 -lighteval|mmlu:high_school_european_history|0|0 -lighteval|mmlu:high_school_geography|0|0 -lighteval|mmlu:high_school_government_and_politics|0|0 -lighteval|mmlu:high_school_macroeconomics|0|0 -lighteval|mmlu:high_school_mathematics|0|0 -lighteval|mmlu:high_school_microeconomics|0|0 -lighteval|mmlu:high_school_physics|0|0 -lighteval|mmlu:high_school_psychology|0|0 -lighteval|mmlu:high_school_statistics|0|0 -lighteval|mmlu:high_school_us_history|0|0 -lighteval|mmlu:high_school_world_history|0|0 -lighteval|mmlu:human_aging|0|0 -lighteval|mmlu:human_sexuality|0|0 -lighteval|mmlu:international_law|0|0 -lighteval|mmlu:jurisprudence|0|0 -lighteval|mmlu:logical_fallacies|0|0 -lighteval|mmlu:machine_learning|0|0 -lighteval|mmlu:management|0|0 -lighteval|mmlu:marketing|0|0 -lighteval|mmlu:medical_genetics|0|0 -lighteval|mmlu:miscellaneous|0|0 -lighteval|mmlu:moral_disputes|0|0 -lighteval|mmlu:moral_scenarios|0|0 -lighteval|mmlu:nutrition|0|0 -lighteval|mmlu:philosophy|0|0 -lighteval|mmlu:prehistory|0|0 -lighteval|mmlu:professional_accounting|0|0 -lighteval|mmlu:professional_law|0|0 -lighteval|mmlu:professional_medicine|0|0 -lighteval|mmlu:professional_psychology|0|0 -lighteval|mmlu:public_relations|0|0 -lighteval|mmlu:security_studies|0|0 -lighteval|mmlu:sociology|0|0 -lighteval|mmlu:us_foreign_policy|0|0 -lighteval|mmlu:virology|0|0 -lighteval|mmlu:world_religions|0|0 +leaderboard|mmlu:abstract_algebra|0|0 +leaderboard|mmlu:anatomy|0|0 +leaderboard|mmlu:astronomy|0|0 +leaderboard|mmlu:business_ethics|0|0 +leaderboard|mmlu:clinical_knowledge|0|0 +leaderboard|mmlu:college_biology|0|0 +leaderboard|mmlu:college_chemistry|0|0 +leaderboard|mmlu:college_computer_science|0|0 +leaderboard|mmlu:college_mathematics|0|0 +leaderboard|mmlu:college_medicine|0|0 +leaderboard|mmlu:college_physics|0|0 +leaderboard|mmlu:computer_security|0|0 +leaderboard|mmlu:conceptual_physics|0|0 +leaderboard|mmlu:econometrics|0|0 +leaderboard|mmlu:electrical_engineering|0|0 +leaderboard|mmlu:elementary_mathematics|0|0 +leaderboard|mmlu:formal_logic|0|0 +leaderboard|mmlu:global_facts|0|0 +leaderboard|mmlu:high_school_biology|0|0 +leaderboard|mmlu:high_school_chemistry|0|0 +leaderboard|mmlu:high_school_computer_science|0|0 +leaderboard|mmlu:high_school_european_history|0|0 +leaderboard|mmlu:high_school_geography|0|0 +leaderboard|mmlu:high_school_government_and_politics|0|0 +leaderboard|mmlu:high_school_macroeconomics|0|0 +leaderboard|mmlu:high_school_mathematics|0|0 +leaderboard|mmlu:high_school_microeconomics|0|0 +leaderboard|mmlu:high_school_physics|0|0 +leaderboard|mmlu:high_school_psychology|0|0 +leaderboard|mmlu:high_school_statistics|0|0 +leaderboard|mmlu:high_school_us_history|0|0 +leaderboard|mmlu:high_school_world_history|0|0 +leaderboard|mmlu:human_aging|0|0 +leaderboard|mmlu:human_sexuality|0|0 +leaderboard|mmlu:international_law|0|0 +leaderboard|mmlu:jurisprudence|0|0 +leaderboard|mmlu:logical_fallacies|0|0 +leaderboard|mmlu:machine_learning|0|0 +leaderboard|mmlu:management|0|0 
+leaderboard|mmlu:marketing|0|0 +leaderboard|mmlu:medical_genetics|0|0 +leaderboard|mmlu:miscellaneous|0|0 +leaderboard|mmlu:moral_disputes|0|0 +leaderboard|mmlu:moral_scenarios|0|0 +leaderboard|mmlu:nutrition|0|0 +leaderboard|mmlu:philosophy|0|0 +leaderboard|mmlu:prehistory|0|0 +leaderboard|mmlu:professional_accounting|0|0 +leaderboard|mmlu:professional_law|0|0 +leaderboard|mmlu:professional_medicine|0|0 +leaderboard|mmlu:professional_psychology|0|0 +leaderboard|mmlu:public_relations|0|0 +leaderboard|mmlu:security_studies|0|0 +leaderboard|mmlu:sociology|0|0 +leaderboard|mmlu:us_foreign_policy|0|0 +leaderboard|mmlu:virology|0|0 +leaderboard|mmlu:world_religions|0|0 original|mmlu:abstract_algebra|0|0 original|mmlu:anatomy|0|0 original|mmlu:astronomy|0|0 @@ -122,21 +122,21 @@ original|mmlu:virology|0|0 original|mmlu:world_religions|0|0 original|mmlu|0|0 # ARC -lighteval|arc:challenge|0|0 +leaderboard|arc:challenge|0|0 lighteval|arc:easy|0|0 original|arc:c:letters|0|0 original|arc:c:options|0|0 original|arc:c:simple|0|0 # HellaSwag helm|hellaswag|0|0 -lighteval|hellaswag|0|0 +leaderboard|hellaswag|0|0 # PIQA helm|piqa|0|0 lighteval|piqa|0|0 # SIQA helm|siqa|0|0 # WinoGrande -lighteval|winogrande|0|0 +leaderboard|winogrande|0|0 # OpenBookQA lighteval|openbookqa|0|0 helm|openbookqa|0|0 @@ -148,7 +148,7 @@ helm|boolq|0|0 # QUAC helm|quac|0|0 # GSM8K -lighteval|gsm8k|0|0 +leaderboard|gsm8k|0|0 # MATH lighteval|math:algebra|0|0 lighteval|math:counting_and_probability|0|0 diff --git a/tests/reference_scores/reference_task_scores.py b/tests/reference_scores/reference_task_scores.py index ad265c96e..f901a4767 100644 --- a/tests/reference_scores/reference_task_scores.py +++ b/tests/reference_scores/reference_task_scores.py @@ -26,35 +26,35 @@ "lighteval|anli:r1|0|0": {"acc": 0.337, "acc_stderr": 0.014955087918653605}, "lighteval|blimp:adjunct_island|0|0": {"acc": 0.893, "acc_stderr": 0.009779910359847165}, "lighteval|blimp:ellipsis_n_bar_1|0|0": {"acc": 0.909, "acc_stderr": 0.009099549538400246}, - "lighteval|arc:challenge|25|0": { + "leaderboard|arc:challenge|25|0": { "acc": 0.257679180887372, "acc_stderr": 0.0127807705627684, "acc_norm": 0.302901023890785, "acc_norm_stderr": 0.013428241573185347, }, - "lighteval|hellaswag|10|0": { + "leaderboard|hellaswag|10|0": { "acc": 0.3981278629755029, "acc_stderr": 0.004885116465550274, "acc_norm": 0.5139414459271061, "acc_norm_stderr": 0.004987841367402517, }, - "lighteval|mmlu:abstract_algebra|5|0": { + "leaderboard|mmlu:abstract_algebra|5|0": { "acc": 0.26, "acc_stderr": 0.04408440022768081, }, - "lighteval|mmlu:college_chemistry|5|0": { + "leaderboard|mmlu:college_chemistry|5|0": { "acc": 0.24, "acc_stderr": 0.04292346959909284, }, - "lighteval|mmlu:computer_security|5|0": { + "leaderboard|mmlu:computer_security|5|0": { "acc": 0.29, "acc_stderr": 0.04560480215720684, }, - "lighteval|mmlu:us_foreign_policy|5|0": { + "leaderboard|mmlu:us_foreign_policy|5|0": { "acc": 0.22, "acc_stderr": 0.041633319989322695, }, - "lighteval|truthfulqa:mc|0|0": { + "leaderboard|truthfulqa:mc|0|0": { "truthfulqa_mc1": 0.22031823745410037, "truthfulqa_mc1_stderr": 0.0145090451714873, "truthfulqa_mc2": 0.3853407807086726, @@ -104,42 +104,42 @@ "pqem": 0.2526389165504879, "pqem_stderr": 0.004336375492801806, }, - "lighteval|gsm8k|5|0": {"qem": 0.009097801364670205, "qem_stderr": 0.002615326510775673}, + "leaderboard|gsm8k|5|0": {"qem": 0.009097801364670205, "qem_stderr": 0.002615326510775673}, # "gsm8k": {"acc": 0.009097801364670205, "acc_stderr": 0.002615326510775673}, Actual 
harness results }, "gpt2": { "lighteval|anli:r1|0|0": {"acc": 0.341, "acc_stderr": 0.014998131348402704}, "lighteval|blimp:adjunct_island|0|0": {"acc": 0.913, "acc_stderr": 0.00891686663074591}, "lighteval|blimp:ellipsis_n_bar_1|0|0": {"acc": 0.842, "acc_stderr": 0.011539894677559568}, - "lighteval|arc:challenge|25|0": { + "leaderboard|arc:challenge|25|0": { "acc": 0.20051194539249148, "acc_stderr": 0.011700318050499373, "acc_norm": 0.21928327645051193, "acc_norm_stderr": 0.012091245787615723, }, - "lighteval|hellaswag|10|0": { + "leaderboard|hellaswag|10|0": { "acc": 0.29267078271260705, "acc_stderr": 0.004540586983229992, "acc_norm": 0.3157737502489544, "acc_norm_stderr": 0.0046387332023738815, }, - "lighteval|mmlu:abstract_algebra|5|0": { + "leaderboard|mmlu:abstract_algebra|5|0": { "acc": 0.21, "acc_stderr": 0.040936018074033256, }, - "lighteval|mmlu:college_chemistry|5|0": { + "leaderboard|mmlu:college_chemistry|5|0": { "acc": 0.2, "acc_stderr": 0.04020151261036846, }, - "lighteval|mmlu:computer_security|5|0": { + "leaderboard|mmlu:computer_security|5|0": { "acc": 0.16, "acc_stderr": 0.03684529491774709, }, - "lighteval|mmlu:us_foreign_policy|5|0": { + "leaderboard|mmlu:us_foreign_policy|5|0": { "acc": 0.27, "acc_stderr": 0.04461960433384739, }, - "lighteval|truthfulqa:mc|0|0": { + "leaderboard|truthfulqa:mc|0|0": { "truthfulqa_mc1": 0.22766217870257038, "truthfulqa_mc1_stderr": 0.01467925503211107, "truthfulqa_mc2": 0.40693581786045147, @@ -189,8 +189,7 @@ "pqem": 0.24427404899422425, "pqem_stderr": 0.00428777857558616, }, - "lighteval|gsm8k|5|0": {"qem": 0.006065200909780136, "qem_stderr": 0.0021386703014604626}, - # "harness|gsm8k|5|0": {"acc": 0.004548900682335102, "acc_stderr": 0.0018535550440036204}, Actual harness results + "leaderboard|gsm8k|5|0": {"qem": 0.006065200909780136, "qem_stderr": 0.0021386703014604626}, }, } @@ -200,35 +199,35 @@ "lighteval|anli:r1|0|0": {"acc": 0.4, "acc_stderr": 0.16329931618554522}, "lighteval|blimp:adjunct_island|0|0": {"acc": 0.9, "acc_stderr": 0.09999999999999999}, "lighteval|blimp:ellipsis_n_bar_1|0|0": {"acc": 0.8, "acc_stderr": 0.13333333333333333}, - "lighteval|arc:challenge|25|0": { + "leaderboard|arc:challenge|25|0": { "acc": 0.2, "acc_stderr": 0.13333333333333333, "acc_norm": 0.1, "acc_norm_stderr": 0.09999999999999999, }, - "lighteval|hellaswag|10|0": { + "leaderboard|hellaswag|10|0": { "acc": 0.4, "acc_stderr": 0.16329931618554522, "acc_norm": 0.8, "acc_norm_stderr": 0.13333333333333333, }, - "lighteval|mmlu:abstract_algebra|5|0": { + "leaderboard|mmlu:abstract_algebra|5|0": { "acc": 0.3, "acc_stderr": 0.15275252316519466, }, - "lighteval|mmlu:college_chemistry|5|0": { + "leaderboard|mmlu:college_chemistry|5|0": { "acc": 0.2, "acc_stderr": 0.13333333333333333, }, - "lighteval|mmlu:computer_security|5|0": { + "leaderboard|mmlu:computer_security|5|0": { "acc": 0.4, "acc_stderr": 0.1632993161855452, }, - "lighteval|mmlu:us_foreign_policy|5|0": { + "leaderboard|mmlu:us_foreign_policy|5|0": { "acc": 0.3, "acc_stderr": 0.15275252316519466, }, - "lighteval|truthfulqa:mc|0|0": { + "leaderboard|truthfulqa:mc|0|0": { "truthfulqa_mc1": 0.3, "truthfulqa_mc1_stderr": 0.15275252316519466, "truthfulqa_mc2": 0.4528717362471066, @@ -278,41 +277,41 @@ "pqem": 0.2, "pqem_stderr": 0.13333333333333333, }, - "lighteval|gsm8k|5|0": {"qem": 0.1, "qem_stderr": 0.09999999999999999}, + "leaderboard|gsm8k|5|0": {"qem": 0.1, "qem_stderr": 0.09999999999999999}, }, "gpt2": { "lighteval|anli:r1|0|0": {"acc": 0.5, "acc_stderr": 0.16666666666666666}, 
"lighteval|blimp:adjunct_island|0|0": {"acc": 0.8, "acc_stderr": 0.13333333333333333}, "lighteval|blimp:ellipsis_n_bar_1|0|0": {"acc": 0.7, "acc_stderr": 0.15275252316519466}, - "lighteval|arc:challenge|25|0": { + "leaderboard|arc:challenge|25|0": { "acc": 0.3, "acc_stderr": 0.15275252316519466, "acc_norm": 0.0, "acc_norm_stderr": 0.0, }, - "lighteval|hellaswag|10|0": { + "leaderboard|hellaswag|10|0": { "acc": 0.4, "acc_stderr": 0.16329931618554522, "acc_norm": 0.6, "acc_norm_stderr": 0.16329931618554522, }, - "lighteval|mmlu:abstract_algebra|5|0": { + "leaderboard|mmlu:abstract_algebra|5|0": { "acc": 0.4, "acc_stderr": 0.16329931618554522, }, - "lighteval|mmlu:college_chemistry|5|0": { + "leaderboard|mmlu:college_chemistry|5|0": { "acc": 0.1, "acc_stderr": 0.09999999999999999, }, - "lighteval|mmlu:computer_security|5|0": { + "leaderboard|mmlu:computer_security|5|0": { "acc": 0.1, "acc_stderr": 0.09999999999999999, }, - "lighteval|mmlu:us_foreign_policy|5|0": { + "leaderboard|mmlu:us_foreign_policy|5|0": { "acc": 0.3, "acc_stderr": 0.15275252316519466, }, - "lighteval|truthfulqa:mc|0|0": { + "leaderboard|truthfulqa:mc|0|0": { "truthfulqa_mc1": 0.3, "truthfulqa_mc1_stderr": 0.15275252316519466, "truthfulqa_mc2": 0.4175889390166028, @@ -362,6 +361,6 @@ "pqem": 0.1, "pqem_stderr": 0.09999999999999999, }, - "lighteval|gsm8k|5|0": {"qem": 0.0, "qem_stderr": 0.0}, + "leaderboard|gsm8k|5|0": {"qem": 0.0, "qem_stderr": 0.0}, }, } diff --git a/tests/reference_scores/reference_tasks.py b/tests/reference_scores/reference_tasks.py index 8ca5a7ff3..91cefba84 100644 --- a/tests/reference_scores/reference_tasks.py +++ b/tests/reference_scores/reference_tasks.py @@ -22,29 +22,29 @@ # todo: add original once we are sure of the results MMLU_SUBSET = [ - "lighteval|mmlu:abstract_algebra|5|0", + "leaderboard|mmlu:abstract_algebra|5|0", "helm|mmlu:abstract_algebra|5|0", # "original|mmlu:abstract_algebra|5", - "lighteval|mmlu:college_chemistry|5|0", + "leaderboard|mmlu:college_chemistry|5|0", "helm|mmlu:college_chemistry|5|0", # "original|mmlu:college_chemistry|5", - "lighteval|mmlu:computer_security|5|0", + "leaderboard|mmlu:computer_security|5|0", "helm|mmlu:computer_security|5|0", # "original|mmlu:computer_security|5", - "lighteval|mmlu:us_foreign_policy|5|0", + "leaderboard|mmlu:us_foreign_policy|5|0", "helm|mmlu:us_foreign_policy|5|0", # "original|mmlu:us_foreign_policy|5", ] LEADERBOARD_SUBSET = [ - "lighteval|arc:challenge|25|0", - "lighteval|truthfulqa:mc|0|0", - "lighteval|hellaswag|10|0", - "lighteval|mmlu:abstract_algebra|5|0", - "lighteval|mmlu:college_chemistry|5|0", - "lighteval|mmlu:computer_security|5|0", - "lighteval|mmlu:us_foreign_policy|5|0", - "lighteval|gsm8k|5|0", + "leaderboard|arc:challenge|25|0", + "leaderboard|truthfulqa:mc|0|0", + "leaderboard|hellaswag|10|0", + "leaderboard|mmlu:abstract_algebra|5|0", + "leaderboard|mmlu:college_chemistry|5|0", + "leaderboard|mmlu:computer_security|5|0", + "leaderboard|mmlu:us_foreign_policy|5|0", + "leaderboard|gsm8k|5|0", ] STABLE_SUBSET = [