Change the eos condition for GSM8K (#85)
We now cover both `leaderboard|gsm8k`, which reproduces the leaderboard's score, and `lighteval|gsm8k`.
clefourrier authored Mar 6, 2024
1 parent 3b0aa23 commit 9b3813f
Showing 10 changed files with 301 additions and 298 deletions.
README.md (2 changes: 1 addition & 1 deletion)
````diff
@@ -113,7 +113,7 @@ Here, `--override_batch_size` defines the _batch size per device_, so the effective batch size will be `override_batch_size * num_gpus`.
 ```shell
 accelerate launch --multi_gpu --num_processes=8 run_evals_accelerate.py \
     --model_args "pretrained=gpt2" \
-    --tasks "lighteval|truthfulqa:mc|0|0,lighteval|gsm8k|0|0" \
+    --tasks "leaderboard|truthfulqa:mc|0|0,leaderboard|gsm8k|0|0" \
     --override_batch_size 1 \
     --output_dir="./evals/"
 ```
````
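With the flags above, for example, the effective batch size is 8: `--override_batch_size 1` on each of the `--num_processes=8` devices.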
run_evals_accelerate.py (2 changes: 1 addition & 1 deletion)
```diff
@@ -22,7 +22,7 @@
 
 """ Example run command:
 accelerate config
-accelerate launch run_evals_accelerate.py --tasks="lighteval|hellaswag|5|1" --output_dir "/scratch/evals" --model_args "pretrained=gpt2"
+accelerate launch run_evals_accelerate.py --tasks="leaderboard|hellaswag|5|1" --output_dir "/scratch/evals" --model_args "pretrained=gpt2"
 """
 import argparse
 
```
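Each `--tasks` entry follows the `suite|task|num_fewshot|truncate_few_shots` pattern, so `leaderboard|hellaswag|5|1` requests HellaSwag from the `leaderboard` suite with 5 few-shot examples, the trailing `1` letting the few-shot count be reduced automatically when the prompt would otherwise exceed the context window (see `TRUNCATE_FEW_SHOTS_DEFAULTS` in the registry change below).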
src/lighteval/logging/evaluation_tracker.py (2 changes: 1 addition & 1 deletion)
```diff
@@ -312,7 +312,7 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None:
         # in the iso date, the `:` are replaced by `-` because windows does not allow `:` in their filenames
 
         task_name = os.path.basename(sub_file).replace("details_", "").split("_2023")[0].split("_2024")[0]
-        # task_name is then equal to `lighteval|mmlu:us_foreign_policy|5`
+        # task_name is then equal to `leaderboard|mmlu:us_foreign_policy|5`
 
         iso_date = os.path.dirname(sub_file)
         # to be able to parse the filename as iso dates, we need to re-replace the `-` with `:`
```
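To make the string surgery concrete, here is a minimal sketch of the same parsing applied to a hypothetical details path; the directory layout and the final `strptime` call are assumptions for illustration, not code from this file:

```python
import os
from datetime import datetime

# Hypothetical details path: a timestamped directory (":" already swapped for
# "-" for Windows compatibility) containing one details file per task.
sub_file = "2024-03-06T09-53-07.123456/details_leaderboard|mmlu:us_foreign_policy|5_2024-03-06T09-53-07.123456.parquet"

# Same recipe as the diff above: drop the "details_" prefix, then cut at the
# "_2023"/"_2024" timestamp suffix to recover the task name.
task_name = os.path.basename(sub_file).replace("details_", "").split("_2023")[0].split("_2024")[0]
assert task_name == "leaderboard|mmlu:us_foreign_policy|5"

# The directory name is the run date; a format string with "-" in the time
# part parses it directly (an alternative to re-replacing "-" with ":").
run_date = datetime.strptime(os.path.dirname(sub_file), "%Y-%m-%dT%H-%M-%S.%f")
```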
src/lighteval/tasks/registry.py (9 changes: 6 additions & 3 deletions)
```diff
@@ -35,9 +35,12 @@
 from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig
 
 
-# original is the reimplementation of original evals
-# custom is to play around
-DEFAULT_SUITES = ["helm", "bigbench", "harness", "lighteval", "original", "custom", "community"]
+# Helm, Bigbench, Harness are implementations following an evaluation suite setup
+# Original follows the original implementation as closely as possible
+# Leaderboard are the evaluations we fixed on the open llm leaderboard - you should get similar results
+# Community are for community added evaluations
+# Custom is for all the experiments you might want to do!
+DEFAULT_SUITES = ["helm", "bigbench", "harness", "leaderboard", "lighteval", "original", "custom", "community"]
 
 TRUNCATE_FEW_SHOTS_DEFAULTS = True
 
```
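The suite is the first field of every task spec, so a string such as `leaderboard|gsm8k|5|0` is only valid if its prefix appears in `DEFAULT_SUITES`. A minimal sketch of that decomposition (the helper name and return shape are illustrative, not the registry's actual API):

```python
# Illustrative sketch, not lighteval's actual API: decompose a task spec and
# check its suite against the DEFAULT_SUITES list from the diff above.
DEFAULT_SUITES = ["helm", "bigbench", "harness", "leaderboard", "lighteval", "original", "custom", "community"]

def parse_task_spec(spec: str) -> tuple[str, str, int, bool]:
    """Split 'suite|task|num_fewshot|truncate_few_shots' into typed fields."""
    suite, task, num_fewshot, truncate = spec.split("|")
    if suite not in DEFAULT_SUITES:
        raise ValueError(f"unknown suite {suite!r}, expected one of {DEFAULT_SUITES}")
    return suite, task, int(num_fewshot), bool(int(truncate))

print(parse_task_spec("leaderboard|gsm8k|5|0"))
# -> ('leaderboard', 'gsm8k', 5, False)
```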
src/lighteval/tasks/tasks_table.jsonl (125 changes: 63 additions & 62 deletions)

Large diffs are not rendered by default.

tasks_examples/all_tasks.txt (124 changes: 62 additions & 62 deletions)
```diff
@@ -628,7 +628,7 @@ lighteval|anli:r1|0|0
 lighteval|anli:r2|0|0
 lighteval|anli:r3|0|0
 lighteval|anli|0|0
-lighteval|arc:challenge|0|0
+leaderboard|arc:challenge|0|0
 lighteval|arc:easy|0|0
 lighteval|arithmetic:1dc|0|0
 lighteval|arithmetic:2da|0|0
@@ -726,10 +726,10 @@ lighteval|glue:rte|0|0
 lighteval|glue:sst2|0|0
 lighteval|glue:stsb|0|0
 lighteval|glue:wnli|0|0
-lighteval|gsm8k|0|0
+leaderboard|gsm8k|0|0
 lighteval|headqa:en|0|0
 lighteval|headqa:es|0|0
-lighteval|hellaswag|0|0
+leaderboard|hellaswag|0|0
 lighteval|iwslt17:ar-en|0|0
 lighteval|iwslt17:de-en|0|0
 lighteval|iwslt17:en-ar|0|0
@@ -771,63 +771,63 @@ lighteval|mgsm:sw|0|0
 lighteval|mgsm:te|0|0
 lighteval|mgsm:th|0|0
 lighteval|mgsm:zh|0|0
-lighteval|mmlu:abstract_algebra|0|0
-lighteval|mmlu:anatomy|0|0
-lighteval|mmlu:astronomy|0|0
-lighteval|mmlu:business_ethics|0|0
-lighteval|mmlu:clinical_knowledge|0|0
-lighteval|mmlu:college_biology|0|0
-lighteval|mmlu:college_chemistry|0|0
-lighteval|mmlu:college_computer_science|0|0
-lighteval|mmlu:college_mathematics|0|0
-lighteval|mmlu:college_medicine|0|0
-lighteval|mmlu:college_physics|0|0
-lighteval|mmlu:computer_security|0|0
-lighteval|mmlu:conceptual_physics|0|0
-lighteval|mmlu:econometrics|0|0
-lighteval|mmlu:electrical_engineering|0|0
-lighteval|mmlu:elementary_mathematics|0|0
-lighteval|mmlu:formal_logic|0|0
-lighteval|mmlu:global_facts|0|0
-lighteval|mmlu:high_school_biology|0|0
-lighteval|mmlu:high_school_chemistry|0|0
-lighteval|mmlu:high_school_computer_science|0|0
-lighteval|mmlu:high_school_european_history|0|0
-lighteval|mmlu:high_school_geography|0|0
-lighteval|mmlu:high_school_government_and_politics|0|0
-lighteval|mmlu:high_school_macroeconomics|0|0
-lighteval|mmlu:high_school_mathematics|0|0
-lighteval|mmlu:high_school_microeconomics|0|0
-lighteval|mmlu:high_school_physics|0|0
-lighteval|mmlu:high_school_psychology|0|0
-lighteval|mmlu:high_school_statistics|0|0
-lighteval|mmlu:high_school_us_history|0|0
-lighteval|mmlu:high_school_world_history|0|0
-lighteval|mmlu:human_aging|0|0
-lighteval|mmlu:human_sexuality|0|0
-lighteval|mmlu:international_law|0|0
-lighteval|mmlu:jurisprudence|0|0
-lighteval|mmlu:logical_fallacies|0|0
-lighteval|mmlu:machine_learning|0|0
-lighteval|mmlu:management|0|0
-lighteval|mmlu:marketing|0|0
-lighteval|mmlu:medical_genetics|0|0
-lighteval|mmlu:miscellaneous|0|0
-lighteval|mmlu:moral_disputes|0|0
-lighteval|mmlu:moral_scenarios|0|0
-lighteval|mmlu:nutrition|0|0
-lighteval|mmlu:philosophy|0|0
-lighteval|mmlu:prehistory|0|0
-lighteval|mmlu:professional_accounting|0|0
-lighteval|mmlu:professional_law|0|0
-lighteval|mmlu:professional_medicine|0|0
-lighteval|mmlu:professional_psychology|0|0
-lighteval|mmlu:public_relations|0|0
-lighteval|mmlu:security_studies|0|0
-lighteval|mmlu:sociology|0|0
-lighteval|mmlu:us_foreign_policy|0|0
-lighteval|mmlu:virology|0|0
-lighteval|mmlu:world_religions|0|0
+leaderboard|mmlu:abstract_algebra|0|0
+leaderboard|mmlu:anatomy|0|0
+leaderboard|mmlu:astronomy|0|0
+leaderboard|mmlu:business_ethics|0|0
+leaderboard|mmlu:clinical_knowledge|0|0
+leaderboard|mmlu:college_biology|0|0
+leaderboard|mmlu:college_chemistry|0|0
+leaderboard|mmlu:college_computer_science|0|0
+leaderboard|mmlu:college_mathematics|0|0
+leaderboard|mmlu:college_medicine|0|0
+leaderboard|mmlu:college_physics|0|0
+leaderboard|mmlu:computer_security|0|0
+leaderboard|mmlu:conceptual_physics|0|0
+leaderboard|mmlu:econometrics|0|0
+leaderboard|mmlu:electrical_engineering|0|0
+leaderboard|mmlu:elementary_mathematics|0|0
+leaderboard|mmlu:formal_logic|0|0
+leaderboard|mmlu:global_facts|0|0
+leaderboard|mmlu:high_school_biology|0|0
+leaderboard|mmlu:high_school_chemistry|0|0
+leaderboard|mmlu:high_school_computer_science|0|0
+leaderboard|mmlu:high_school_european_history|0|0
+leaderboard|mmlu:high_school_geography|0|0
+leaderboard|mmlu:high_school_government_and_politics|0|0
+leaderboard|mmlu:high_school_macroeconomics|0|0
+leaderboard|mmlu:high_school_mathematics|0|0
+leaderboard|mmlu:high_school_microeconomics|0|0
+leaderboard|mmlu:high_school_physics|0|0
+leaderboard|mmlu:high_school_psychology|0|0
+leaderboard|mmlu:high_school_statistics|0|0
+leaderboard|mmlu:high_school_us_history|0|0
+leaderboard|mmlu:high_school_world_history|0|0
+leaderboard|mmlu:human_aging|0|0
+leaderboard|mmlu:human_sexuality|0|0
+leaderboard|mmlu:international_law|0|0
+leaderboard|mmlu:jurisprudence|0|0
+leaderboard|mmlu:logical_fallacies|0|0
+leaderboard|mmlu:machine_learning|0|0
+leaderboard|mmlu:management|0|0
+leaderboard|mmlu:marketing|0|0
+leaderboard|mmlu:medical_genetics|0|0
+leaderboard|mmlu:miscellaneous|0|0
+leaderboard|mmlu:moral_disputes|0|0
+leaderboard|mmlu:moral_scenarios|0|0
+leaderboard|mmlu:nutrition|0|0
+leaderboard|mmlu:philosophy|0|0
+leaderboard|mmlu:prehistory|0|0
+leaderboard|mmlu:professional_accounting|0|0
+leaderboard|mmlu:professional_law|0|0
+leaderboard|mmlu:professional_medicine|0|0
+leaderboard|mmlu:professional_psychology|0|0
+leaderboard|mmlu:public_relations|0|0
+leaderboard|mmlu:security_studies|0|0
+leaderboard|mmlu:sociology|0|0
+leaderboard|mmlu:us_foreign_policy|0|0
+leaderboard|mmlu:virology|0|0
+leaderboard|mmlu:world_religions|0|0
 lighteval|mtnt2019:en-fr|0|0
 lighteval|mtnt2019:en-ja|0|0
 lighteval|mtnt2019:fr-en|0|0
@@ -881,15 +881,15 @@ lighteval|the_pile:youtubesubtitles|0|0
 lighteval|toxigen|0|0
 lighteval|triviaqa|0|0
 lighteval|truthfulqa:gen|0|0
-lighteval|truthfulqa:mc|0|0
+leaderboard|truthfulqa:mc|0|0
 lighteval|unscramble:anagrams1|0|0
 lighteval|unscramble:anagrams2|0|0
 lighteval|unscramble:cycle_letters|0|0
 lighteval|unscramble:random_insertion|0|0
 lighteval|unscramble:reversed_words|0|0
 lighteval|webqs|0|0
 lighteval|wikitext|0|0
-lighteval|winogrande|0|0
+leaderboard|winogrande|0|0
 lighteval|wmt08:cs-en|0|0
 lighteval|wmt08:de-en|0|0
 lighteval|wmt08:en-cs|0|0
```
tasks_examples/open_llm_leaderboard_tasks.txt (124 changes: 62 additions & 62 deletions)
```diff
@@ -1,68 +1,68 @@
 # ARC
-lighteval|arc:challenge|25|0
+leaderboard|arc:challenge|25|0
 # HellaSwag
-lighteval|hellaswag|10|0
+leaderboard|hellaswag|10|0
 # TruthfulQA
-lighteval|truthfulqa:mc|0|0
+leaderboard|truthfulqa:mc|0|0
 # MMLU
-lighteval|mmlu:abstract_algebra|5|0
-lighteval|mmlu:anatomy|5|0
-lighteval|mmlu:astronomy|5|0
-lighteval|mmlu:business_ethics|5|0
-lighteval|mmlu:clinical_knowledge|5|0
-lighteval|mmlu:college_biology|5|0
-lighteval|mmlu:college_chemistry|5|0
-lighteval|mmlu:college_computer_science|5|0
-lighteval|mmlu:college_mathematics|5|0
-lighteval|mmlu:college_medicine|5|0
-lighteval|mmlu:college_physics|5|0
-lighteval|mmlu:computer_security|5|0
-lighteval|mmlu:conceptual_physics|5|0
-lighteval|mmlu:econometrics|5|0
-lighteval|mmlu:electrical_engineering|5|0
-lighteval|mmlu:elementary_mathematics|5|0
-lighteval|mmlu:formal_logic|5|0
-lighteval|mmlu:global_facts|5|0
-lighteval|mmlu:high_school_biology|5|0
-lighteval|mmlu:high_school_chemistry|5|0
-lighteval|mmlu:high_school_computer_science|5|0
-lighteval|mmlu:high_school_european_history|5|0
-lighteval|mmlu:high_school_geography|5|0
-lighteval|mmlu:high_school_government_and_politics|5|0
-lighteval|mmlu:high_school_macroeconomics|5|0
-lighteval|mmlu:high_school_mathematics|5|0
-lighteval|mmlu:high_school_microeconomics|5|0
-lighteval|mmlu:high_school_physics|5|0
-lighteval|mmlu:high_school_psychology|5|0
-lighteval|mmlu:high_school_statistics|5|0
-lighteval|mmlu:high_school_us_history|5|0
-lighteval|mmlu:high_school_world_history|5|0
-lighteval|mmlu:human_aging|5|0
-lighteval|mmlu:human_sexuality|5|0
-lighteval|mmlu:international_law|5|0
-lighteval|mmlu:jurisprudence|5|0
-lighteval|mmlu:logical_fallacies|5|0
-lighteval|mmlu:machine_learning|5|0
-lighteval|mmlu:management|5|0
-lighteval|mmlu:marketing|5|0
-lighteval|mmlu:medical_genetics|5|0
-lighteval|mmlu:miscellaneous|5|0
-lighteval|mmlu:moral_disputes|5|0
-lighteval|mmlu:moral_scenarios|5|0
-lighteval|mmlu:nutrition|5|0
-lighteval|mmlu:philosophy|5|0
-lighteval|mmlu:prehistory|5|0
-lighteval|mmlu:professional_accounting|5|0
-lighteval|mmlu:professional_law|5|0
-lighteval|mmlu:professional_medicine|5|0
-lighteval|mmlu:professional_psychology|5|0
-lighteval|mmlu:public_relations|5|0
-lighteval|mmlu:security_studies|5|0
-lighteval|mmlu:sociology|5|0
-lighteval|mmlu:us_foreign_policy|5|0
-lighteval|mmlu:virology|5|0
-lighteval|mmlu:world_religions|5|0
+leaderboard|mmlu:abstract_algebra|5|0
+leaderboard|mmlu:anatomy|5|0
+leaderboard|mmlu:astronomy|5|0
+leaderboard|mmlu:business_ethics|5|0
+leaderboard|mmlu:clinical_knowledge|5|0
+leaderboard|mmlu:college_biology|5|0
+leaderboard|mmlu:college_chemistry|5|0
+leaderboard|mmlu:college_computer_science|5|0
+leaderboard|mmlu:college_mathematics|5|0
+leaderboard|mmlu:college_medicine|5|0
+leaderboard|mmlu:college_physics|5|0
+leaderboard|mmlu:computer_security|5|0
+leaderboard|mmlu:conceptual_physics|5|0
+leaderboard|mmlu:econometrics|5|0
+leaderboard|mmlu:electrical_engineering|5|0
+leaderboard|mmlu:elementary_mathematics|5|0
+leaderboard|mmlu:formal_logic|5|0
+leaderboard|mmlu:global_facts|5|0
+leaderboard|mmlu:high_school_biology|5|0
+leaderboard|mmlu:high_school_chemistry|5|0
+leaderboard|mmlu:high_school_computer_science|5|0
+leaderboard|mmlu:high_school_european_history|5|0
+leaderboard|mmlu:high_school_geography|5|0
+leaderboard|mmlu:high_school_government_and_politics|5|0
+leaderboard|mmlu:high_school_macroeconomics|5|0
+leaderboard|mmlu:high_school_mathematics|5|0
+leaderboard|mmlu:high_school_microeconomics|5|0
+leaderboard|mmlu:high_school_physics|5|0
+leaderboard|mmlu:high_school_psychology|5|0
+leaderboard|mmlu:high_school_statistics|5|0
+leaderboard|mmlu:high_school_us_history|5|0
+leaderboard|mmlu:high_school_world_history|5|0
+leaderboard|mmlu:human_aging|5|0
+leaderboard|mmlu:human_sexuality|5|0
+leaderboard|mmlu:international_law|5|0
+leaderboard|mmlu:jurisprudence|5|0
+leaderboard|mmlu:logical_fallacies|5|0
+leaderboard|mmlu:machine_learning|5|0
+leaderboard|mmlu:management|5|0
+leaderboard|mmlu:marketing|5|0
+leaderboard|mmlu:medical_genetics|5|0
+leaderboard|mmlu:miscellaneous|5|0
+leaderboard|mmlu:moral_disputes|5|0
+leaderboard|mmlu:moral_scenarios|5|0
+leaderboard|mmlu:nutrition|5|0
+leaderboard|mmlu:philosophy|5|0
+leaderboard|mmlu:prehistory|5|0
+leaderboard|mmlu:professional_accounting|5|0
+leaderboard|mmlu:professional_law|5|0
+leaderboard|mmlu:professional_medicine|5|0
+leaderboard|mmlu:professional_psychology|5|0
+leaderboard|mmlu:public_relations|5|0
+leaderboard|mmlu:security_studies|5|0
+leaderboard|mmlu:sociology|5|0
+leaderboard|mmlu:us_foreign_policy|5|0
+leaderboard|mmlu:virology|5|0
+leaderboard|mmlu:world_religions|5|0
 # WinoGrande
-lighteval|winogrande|5|0
+leaderboard|winogrande|5|0
 # GSM8K
-lighteval|gsm8k|5|0
+leaderboard|gsm8k|5|0
```
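Assuming `--tasks` accepts a file path as well as a comma-separated list (the way this examples folder is meant to be consumed), the whole leaderboard sweep can be launched in one command; the model and paths below are placeholders:

```shell
accelerate launch --multi_gpu --num_processes=8 run_evals_accelerate.py \
    --model_args "pretrained=gpt2" \
    --tasks tasks_examples/open_llm_leaderboard_tasks.txt \
    --override_batch_size 1 \
    --output_dir="./evals/"
```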