From 0c808016f2937360e5c4e11f38e4c0372e704861 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 28 Nov 2024 20:29:54 +0100 Subject: [PATCH] Set up docs (#403) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add docs * Add wiki to docs * Adapt wiki as docs * Force docs build * Fix link in _toctree * Add titles to docs pages * Update docs/source/evaluate-the-model-on-a-server-or-container.mdx Co-authored-by: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com> --------- Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> Co-authored-by: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com> --- .github/workflows/doc-build.yml | 18 + .github/workflows/doc-pr-build.yml | 16 + docs/source/_toctree.yml | 30 + docs/source/adding-a-custom-task.mdx | 196 +++ docs/source/adding-a-new-metric.mdx | 93 ++ docs/source/available-tasks.mdx | 1250 +++++++++++++++++ ...ntributing-to-multilingual-evaluations.mdx | 107 ++ ...ate-the-model-on-a-server-or-container.mdx | 67 + docs/source/index.mdx | 18 + docs/source/installation.mdx | 46 + docs/source/metric-list.mdx | 76 + docs/source/quicktour.mdx | 160 +++ docs/source/saving-and-reading-results.mdx | 214 +++ docs/source/use-vllm-as-backend.mdx | 53 + docs/source/using-the-python-api.mdx | 62 + pyproject.toml | 1 + 16 files changed, 2407 insertions(+) create mode 100644 .github/workflows/doc-build.yml create mode 100644 .github/workflows/doc-pr-build.yml create mode 100644 docs/source/_toctree.yml create mode 100644 docs/source/adding-a-custom-task.mdx create mode 100644 docs/source/adding-a-new-metric.mdx create mode 100644 docs/source/available-tasks.mdx create mode 100644 docs/source/contributing-to-multilingual-evaluations.mdx create mode 100644 docs/source/evaluate-the-model-on-a-server-or-container.mdx create mode 100644 docs/source/index.mdx create mode 100644 docs/source/installation.mdx create mode 100644 docs/source/metric-list.mdx create mode 100644 docs/source/quicktour.mdx create mode 100644 docs/source/saving-and-reading-results.mdx create mode 100644 docs/source/use-vllm-as-backend.mdx create mode 100644 docs/source/using-the-python-api.mdx diff --git a/.github/workflows/doc-build.yml b/.github/workflows/doc-build.yml new file mode 100644 index 000000000..cd345d3d3 --- /dev/null +++ b/.github/workflows/doc-build.yml @@ -0,0 +1,18 @@ +name: Build Documentation + +on: + push: + branches: + - main + - doc-builder* + - v*-release + +jobs: + build: + uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main + with: + commit_sha: ${{ github.sha }} + package: lighteval + secrets: + token: ${{ secrets.HUGGINGFACE_PUSH }} + hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} diff --git a/.github/workflows/doc-pr-build.yml b/.github/workflows/doc-pr-build.yml new file mode 100644 index 000000000..f96e20583 --- /dev/null +++ b/.github/workflows/doc-pr-build.yml @@ -0,0 +1,16 @@ +name: Build PR Documentation + +on: + pull_request: + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + build: + uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main + with: + commit_sha: ${{ github.event.pull_request.head.sha }} + pr_number: ${{ github.event.number }} + package: lighteval diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml new file mode 100644 index 000000000..243462b3d 
--- /dev/null +++ b/docs/source/_toctree.yml @@ -0,0 +1,30 @@ +- sections: + - local: index + title: 🤗 Lighteval + - local: installation + title: Installation + - local: quicktour + title: Quicktour + title: Getting started +- sections: + - local: saving-and-reading-results + title: Save and read results + - local: using-the-python-api + title: Use the Python API + - local: adding-a-custom-task + title: Add a custom task + - local: adding-a-new-metric + title: Add a custom metric + - local: use-vllm-as-backend + title: Use vLLM as backend + - local: evaluate-the-model-on-a-server-or-container + title: Evaluate on Server + - local: contributing-to-multilingual-evaluations + title: Contributing to multilingual evaluations + title: Guides +- sections: + - local: metric-list + title: Available Metrics + - local: available-tasks + title: Available Tasks + title: API diff --git a/docs/source/adding-a-custom-task.mdx b/docs/source/adding-a-custom-task.mdx new file mode 100644 index 000000000..bcaa932ff --- /dev/null +++ b/docs/source/adding-a-custom-task.mdx @@ -0,0 +1,196 @@ +# Adding a Custom Task + +To add a new task, first open an issue to determine whether it should be +integrated into the core evaluations of lighteval, the extended tasks, or the +community tasks, and add its dataset on the Hub. + +- Core evaluations are evaluations that only require standard logic in their + metrics and processing, and that we will add to our test suite to ensure + non-regression over time. They already see high usage in the community. +- Extended evaluations are evaluations that require custom logic in their + metrics (complex normalisation, an LLM as a judge, ...), which we added to + make users' lives easier. They already see high usage in the community. +- Community evaluations are submissions by the community of new tasks. + +A popular community evaluation can move to become an extended or core evaluation over time. + +> [!TIP] +> You can find examples of custom tasks in the `community_tasks` directory. + +## Step-by-step creation of a custom task + +> [!WARNING] +> To contribute your custom task to the lighteval repo, you would first need +> to install the required dev dependencies by running `pip install -e .[dev]` +> and then run `pre-commit install` to install the pre-commit hooks. + +First, create a Python file under the `community_tasks` directory. + +You need to define a prompt function that will convert a line from your +dataset to a document to be used for evaluation. + +```python +# Define as many as you need for your different tasks +def prompt_fn(line, task_name: str = None): + """Defines how to go from a dataset line to a doc object. + Follow examples in src/lighteval/tasks/default_prompts.py, or get more info + about what this function should do in the README. + """ + return Doc( + task_name=task_name, + query=line["question"], + choices=[f" {c}" for c in line["choices"]], + gold_index=line["gold"], + instruction="", + ) +``` + +Then, you need to choose a metric: you can either use an existing one (defined +in `lighteval/metrics/metrics.py`) or [create a custom one](adding-a-new-metric). + +```python +custom_metric = SampleLevelMetric( + metric_name="my_custom_metric_name", + higher_is_better=True, + category=MetricCategory.IGNORED, + use_case=MetricUseCase.NONE, + sample_level_fn=lambda x: x, # how to compute the score for one sample + corpus_level_fn=np.mean, # how to aggregate the sample-level metrics +) +``` + +Then, you need to define your task.
You can define a task with or without subsets. +To define a task with no subsets: + +```python +# This is how you create a simple task (like hellaswag) which has one single subset +# attached to it, and one evaluation possible. +task = LightevalTaskConfig( + name="myothertask", + prompt_function=prompt_fn, # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py + suite=["community"], + hf_repo="", + hf_subset="default", + hf_avail_splits=[], + evaluation_splits=[], + few_shots_split=None, + few_shots_select=None, + metric=[], # select your metric in Metrics +) +``` + +If you want to create a task with multiple subsets, add them to the +`SAMPLE_SUBSETS` list and create a task for each subset. + +```python +SAMPLE_SUBSETS = [] # list of all the subsets to use for this eval + + +class CustomSubsetTask(LightevalTaskConfig): + def __init__( + self, + name, + hf_subset, + ): + super().__init__( + name=name, + hf_subset=hf_subset, + prompt_function=prompt_fn, # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py + hf_repo="", + metric=[custom_metric], # select your metric in Metrics or use your custom_metric + hf_avail_splits=[], + evaluation_splits=[], + few_shots_split=None, + few_shots_select=None, + suite=["community"], + generation_size=-1, + stop_sequence=None, + output_regex=None, + frozen=False, + ) +SUBSET_TASKS = [CustomSubsetTask(name=f"mytask:{subset}", hf_subset=subset) for subset in SAMPLE_SUBSETS] +``` + +Here is a list of the parameters and their meanings: + +- `name` (str), your evaluation name +- `suite` (list), the suite(s) to which your evaluation should belong. This + field allows us to compare different task implementations and is used during + task selection to differentiate between the versions to launch. At the moment, you'll + find the keywords ["helm", "bigbench", "original", "lighteval", "community", + "custom"]; for core evals, please choose `lighteval`. +- `prompt_function` (Callable), the prompt function you defined in the step + above +- `hf_repo` (str), the path to your evaluation dataset on the hub +- `hf_subset` (str), the specific subset you want to use for your evaluation + (note: when the dataset has no subset, fill this field with `"default"`, not + with `None` or `""`) +- `hf_avail_splits` (list), all the splits available for your dataset (train, + valid or validation, test, other...) +- `evaluation_splits` (list), the splits you want to use for evaluation +- `few_shots_split` (str, can be `null`), the specific split from which you + want to select samples for your few-shot examples. It should be different + from the sets included in `evaluation_splits` +- `few_shots_select` (str, can be `null`), the method that you will use to + select items for your few-shot examples. Can be `null`, or one of: + - `balanced` selects examples from the `few_shots_split` with balanced + labels, to avoid skewing the few-shot examples (hence the model + generations) toward one specific label + - `random` selects examples at random from the `few_shots_split` + - `random_sampling` selects new examples at random from the + `few_shots_split` for every new item, but if a sampled item is equal to + the current one, it is removed from the available samples + - `random_sampling_from_train` selects new examples at random from the + `few_shots_split` for every new item, but if a sampled item is equal to + the current one, it is kept! Only use this if you know what you are + doing.
+ - `sequential` selects the first `n` examples of the `few_shots_split` +- `generation_size` (int), the maximum number of tokens allowed for a + generative evaluation. If your evaluation is a log likelihood evaluation + (multi-choice), this value should be -1 +- `stop_sequence` (list), a list of strings acting as end-of-sentence tokens + for your generation +- `metric` (list), the metrics you want to use for your evaluation (see the next + section for a detailed explanation) +- `output_regex` (str), a regex string that will be used to filter your + generation. (Generative metrics will only select tokens that are between the + first and the second sequence matched by the regex. For example, for a regex + matching `\n` and a generation `\nModel generation output\nSome other text` + the metric will only be fed with `Model generation output`) +- `frozen` (bool), for now, is set to False, but we will steadily switch all + stable tasks to True. +- `trust_dataset` (bool), set to True if you trust the dataset. + + +Then you need to add your task to the `TASKS_TABLE` list. + +```python +# STORE YOUR EVALS + +# tasks with subset: +TASKS_TABLE = SUBSET_TASKS + +# tasks without subset: +# TASKS_TABLE = [task] +``` + +Finally, you need to add the module logic below to convert your task to a dict for lighteval. + +```python +# MODULE LOGIC +# You should not need to touch this +# Convert to dict for lighteval +if __name__ == "__main__": + print([t.name for t in TASKS_TABLE]) + print(len(TASKS_TABLE)) +``` + +Once your file is created, you can run the evaluation with the following command: + +```bash +lighteval accelerate \ + --model_args "pretrained=HuggingFaceH4/zephyr-7b-beta" \ + --tasks "community|{custom_task}|{fewshots}|{truncate_few_shot}" \ + --custom_tasks {path_to_your_custom_task_file} \ + --output_dir "./evals" +``` diff --git a/docs/source/adding-a-new-metric.mdx b/docs/source/adding-a-new-metric.mdx new file mode 100644 index 000000000..e8562af4f --- /dev/null +++ b/docs/source/adding-a-new-metric.mdx @@ -0,0 +1,93 @@ +# Adding a New Metric + +First, check if you can use one of the parametrized functions in +[src.lighteval.metrics.metrics_corpus]() or +[src.lighteval.metrics.metrics_sample](). + +If not, you can use the `custom_task` system to register your new metric: + +> [!TIP] +> To see an example of a custom metric added along with a custom task, look at the IFEval custom task. + + +> [!WARNING] +> To contribute your custom metric to the lighteval repo, you would first need +> to install the required dev dependencies by running `pip install -e .[dev]` +> and then run `pre-commit install` to install the pre-commit hooks. + + +- Create a new Python file which should contain the full logic of your metric. +- The file also needs to start with these imports: + +```python +from aenum import extend_enum +from lighteval.metrics import Metrics +``` + +You need to define a sample-level metric: + +```python +def custom_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> bool: + response = predictions[0] + return response == formatted_doc.choices[formatted_doc.gold_index] +``` + +Here the sample-level metric only returns one metric. If you want to return multiple metrics per sample, you need to return a dictionary with the metric names as keys and their values as values.
+ +```python +def custom_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> dict: + response = predictions[0] + return {"accuracy": response == formatted_doc.choices[formatted_doc.gold_index], "other_metric": 0.5} +``` + +Then, you can define an aggregation function if needed, a common aggregation function is `np.mean`. + +```python +def agg_function(items): + flat_items = [item for sublist in items for item in sublist] + score = sum(flat_items) / len(flat_items) + return score +``` + +Finally, you can define your metric. If it's a sample level metric, you can use the following code: + +```python +my_custom_metric = SampleLevelMetric( + metric_name={custom_metric_name}, + higher_is_better={either True or False}, + category={MetricCategory}, + use_case={MetricUseCase}, + sample_level_fn=custom_metric, + corpus_level_fn=agg_function, +) +``` + +If your metric defines multiple metrics per sample, you can use the following code: + +```python +custom_metric = SampleLevelMetricGrouping( + metric_name={submetric_names}, + higher_is_better={n: {True or False} for n in submetric_names}, + category={MetricCategory}, + use_case={MetricUseCase}, + sample_level_fn=custom_metric, + corpus_level_fn={ + "accuracy": np.mean, + "other_metric": agg_function, + }, +) +``` + +To finish, add the following, so that it adds your metric to our metrics list +when loaded as a module. + +```python +# Adds the metric to the metric list! +extend_enum(Metrics, "metric_name", metric_function) +if __name__ == "__main__": + print("Imported metric") +``` + +You can then give your custom metric to lighteval by using `--custom-tasks +path_to_your_file` when launching it. + diff --git a/docs/source/available-tasks.mdx b/docs/source/available-tasks.mdx new file mode 100644 index 000000000..9b167d21e --- /dev/null +++ b/docs/source/available-tasks.mdx @@ -0,0 +1,1250 @@ +# Available Tasks + +You can get a list of all the available tasks by running: + +```bash +lighteval tasks --list +``` + +## List of tasks + +- bigbench: + - bigbench|abstract_narrative_understanding + - bigbench|anachronisms + - bigbench|analogical_similarity + - bigbench|analytic_entailment + - bigbench|arithmetic_bb + - bigbench|ascii_word_recognition + - bigbench|authorship_verification + - bigbench|auto_categorization + - bigbench|auto_debugging + - bigbench|bbq_lite_json + - bigbench|bridging_anaphora_resolution_barqa + - bigbench|causal_judgment + - bigbench|cause_and_effect + - bigbench|checkmate_in_one + - bigbench|chess_state_tracking + - bigbench|chinese_remainder_theorem + - bigbench|cifar10_classification + - bigbench|code_line_description + - bigbench|codenames + - bigbench|color + - bigbench|common_morpheme + - bigbench|conceptual_combinations + - bigbench|conlang_translation + - bigbench|contextual_parametric_knowledge_conflicts + - bigbench|coqa_bb + - bigbench|crash_blossom + - bigbench|crass_ai + - bigbench|cryobiology_spanish + - bigbench|cryptonite + - bigbench|cs_algorithms + - bigbench|dark_humor_detection + - bigbench|date_understanding + - bigbench|disambiguation_qa + - bigbench|discourse_marker_prediction + - bigbench|disfl_qa + - bigbench|dyck_languages + - bigbench|elementary_math_qa + - bigbench|emoji_movie + - bigbench|emojis_emotion_prediction + - bigbench|empirical_judgments + - bigbench|english_proverbs + - bigbench|english_russian_proverbs + - bigbench|entailed_polarity + - bigbench|entailed_polarity_hindi + - bigbench|epistemic_reasoning + - bigbench|evaluating_information_essentiality + - bigbench|fact_checker + - 
bigbench|fantasy_reasoning + - bigbench|few_shot_nlg + - bigbench|figure_of_speech_detection + - bigbench|formal_fallacies_syllogisms_negation + - bigbench|gem + - bigbench|gender_inclusive_sentences_german + - bigbench|general_knowledge + - bigbench|geometric_shapes + - bigbench|goal_step_wikihow + - bigbench|gre_reading_comprehension + - bigbench|hhh_alignment + - bigbench|hindi_question_answering + - bigbench|hindu_knowledge + - bigbench|hinglish_toxicity + - bigbench|human_organs_senses + - bigbench|hyperbaton + - bigbench|identify_math_theorems + - bigbench|identify_odd_metaphor + - bigbench|implicatures + - bigbench|implicit_relations + - bigbench|intent_recognition + - bigbench|international_phonetic_alphabet_nli + - bigbench|international_phonetic_alphabet_transliterate + - bigbench|intersect_geometry + - bigbench|irony_identification + - bigbench|kanji_ascii + - bigbench|kannada + - bigbench|key_value_maps + - bigbench|known_unknowns + - bigbench|language_games + - bigbench|language_identification + - bigbench|linguistic_mappings + - bigbench|linguistics_puzzles + - bigbench|logic_grid_puzzle + - bigbench|logical_args + - bigbench|logical_deduction + - bigbench|logical_fallacy_detection + - bigbench|logical_sequence + - bigbench|mathematical_induction + - bigbench|matrixshapes + - bigbench|metaphor_boolean + - bigbench|metaphor_understanding + - bigbench|minute_mysteries_qa + - bigbench|misconceptions + - bigbench|misconceptions_russian + - bigbench|mnist_ascii + - bigbench|modified_arithmetic + - bigbench|moral_permissibility + - bigbench|movie_dialog_same_or_different + - bigbench|movie_recommendation + - bigbench|mult_data_wrangling + - bigbench|multiemo + - bigbench|natural_instructions + - bigbench|navigate + - bigbench|nonsense_words_grammar + - bigbench|novel_concepts + - bigbench|object_counting + - bigbench|odd_one_out + - bigbench|operators + - bigbench|paragraph_segmentation + - bigbench|parsinlu_qa + - bigbench|parsinlu_reading_comprehension + - bigbench|penguins_in_a_table + - bigbench|periodic_elements + - bigbench|persian_idioms + - bigbench|phrase_relatedness + - bigbench|physical_intuition + - bigbench|physics + - bigbench|physics_questions + - bigbench|play_dialog_same_or_different + - bigbench|polish_sequence_labeling + - bigbench|presuppositions_as_nli + - bigbench|qa_wikidata + - bigbench|question_selection + - bigbench|real_or_fake_text + - bigbench|reasoning_about_colored_objects + - bigbench|repeat_copy_logic + - bigbench|rephrase + - bigbench|rhyming + - bigbench|riddle_sense + - bigbench|ruin_names + - bigbench|salient_translation_error_detection + - bigbench|scientific_press_release + - bigbench|semantic_parsing_in_context_sparc + - bigbench|semantic_parsing_spider + - bigbench|sentence_ambiguity + - bigbench|similarities_abstraction + - bigbench|simp_turing_concept + - bigbench|simple_arithmetic_json + - bigbench|simple_arithmetic_json_multiple_choice + - bigbench|simple_arithmetic_json_subtasks + - bigbench|simple_arithmetic_multiple_targets_json + - bigbench|simple_ethical_questions + - bigbench|simple_text_editing + - bigbench|snarks + - bigbench|social_iqa + - bigbench|social_support + - bigbench|sports_understanding + - bigbench|strange_stories + - bigbench|strategyqa + - bigbench|sufficient_information + - bigbench|suicide_risk + - bigbench|swahili_english_proverbs + - bigbench|swedish_to_german_proverbs + - bigbench|symbol_interpretation + - bigbench|tellmewhy + - bigbench|temporal_sequences + - bigbench|tense + - bigbench|timedial + - 
bigbench|topical_chat + - bigbench|tracking_shuffled_objects + - bigbench|understanding_fables + - bigbench|undo_permutation + - bigbench|unit_conversion + - bigbench|unit_interpretation + - bigbench|unnatural_in_context_learning + - bigbench|vitaminc_fact_verification + - bigbench|what_is_the_tao + - bigbench|which_wiki_edit + - bigbench|wino_x_german + - bigbench|winowhy + - bigbench|word_sorting + - bigbench|word_unscrambling + +- harness: + - harness|bbh:boolean_expressions + - harness|bbh:causal_judgment + - harness|bbh:date_understanding + - harness|bbh:disambiguation_qa + - harness|bbh:dyck_languages + - harness|bbh:formal_fallacies + - harness|bbh:geometric_shapes + - harness|bbh:hyperbaton + - harness|bbh:logical_deduction_five_objects + - harness|bbh:logical_deduction_seven_objects + - harness|bbh:logical_deduction_three_objects + - harness|bbh:movie_recommendation + - harness|bbh:multistep_arithmetic_two + - harness|bbh:navigate + - harness|bbh:object_counting + - harness|bbh:penguins_in_a_table + - harness|bbh:reasoning_about_colored_objects + - harness|bbh:ruin_names + - harness|bbh:salient_translation_error_detection + - harness|bbh:snarks + - harness|bbh:sports_understanding + - harness|bbh:temporal_sequences + - harness|bbh:tracking_shuffled_objects_five_objects + - harness|bbh:tracking_shuffled_objects_seven_objects + - harness|bbh:tracking_shuffled_objects_three_objects + - harness|bbh:web_of_lies + - harness|bbh:word_sorting + - harness|bigbench:causal_judgment + - harness|bigbench:date_understanding + - harness|bigbench:disambiguation_qa + - harness|bigbench:geometric_shapes + - harness|bigbench:logical_deduction_five_objects + - harness|bigbench:logical_deduction_seven_objects + - harness|bigbench:logical_deduction_three_objects + - harness|bigbench:movie_recommendation + - harness|bigbench:navigate + - harness|bigbench:reasoning_about_colored_objects + - harness|bigbench:ruin_names + - harness|bigbench:salient_translation_error_detection + - harness|bigbench:snarks + - harness|bigbench:sports_understanding + - harness|bigbench:temporal_sequences + - harness|bigbench:tracking_shuffled_objects_five_objects + - harness|bigbench:tracking_shuffled_objects_seven_objects + - harness|bigbench:tracking_shuffled_objects_three_objects + - harness|wikitext:103:document_level + +- helm: + - helm|babi_qa + - helm|bbq + - helm|bbq:Age + - helm|bbq:Disability_status + - helm|bbq:Gender_identity + - helm|bbq:Physical_appearance + - helm|bbq:Race_ethnicity + - helm|bbq:Race_x_SES + - helm|bbq:Race_x_gender + - helm|bbq:Religion + - helm|bbq:SES + - helm|bbq:Sexual_orientation + - helm|bbq=Nationality + - helm|bigbench:auto_debugging + - helm|bigbench:bbq_lite_json:age_ambig + - helm|bigbench:bbq_lite_json:age_disambig + - helm|bigbench:bbq_lite_json:disability_status_ambig + - helm|bigbench:bbq_lite_json:disability_status_disambig + - helm|bigbench:bbq_lite_json:gender_identity_ambig + - helm|bigbench:bbq_lite_json:gender_identity_disambig + - helm|bigbench:bbq_lite_json:nationality_ambig + - helm|bigbench:bbq_lite_json:nationality_disambig + - helm|bigbench:bbq_lite_json:physical_appearance_ambig + - helm|bigbench:bbq_lite_json:physical_appearance_disambig + - helm|bigbench:bbq_lite_json:race_ethnicity_ambig + - helm|bigbench:bbq_lite_json:race_ethnicity_disambig + - helm|bigbench:bbq_lite_json:religion_ambig + - helm|bigbench:bbq_lite_json:religion_disambig + - helm|bigbench:bbq_lite_json:ses_ambig + - helm|bigbench:bbq_lite_json:ses_disambig + - 
helm|bigbench:bbq_lite_json:sexual_orientation_ambig + - helm|bigbench:bbq_lite_json:sexual_orientation_disambig + - helm|bigbench:code_line_description + - helm|bigbench:conceptual_combinations:contradictions + - helm|bigbench:conceptual_combinations:emergent_properties + - helm|bigbench:conceptual_combinations:fanciful_fictional_combinations + - helm|bigbench:conceptual_combinations:homonyms + - helm|bigbench:conceptual_combinations:invented_words + - helm|bigbench:conlang_translation:adna_from + - helm|bigbench:conlang_translation:adna_to + - helm|bigbench:conlang_translation:atikampe_from + - helm|bigbench:conlang_translation:atikampe_to + - helm|bigbench:conlang_translation:gornam_from + - helm|bigbench:conlang_translation:gornam_to + - helm|bigbench:conlang_translation:holuan_from + - helm|bigbench:conlang_translation:holuan_to + - helm|bigbench:conlang_translation:mkafala_from + - helm|bigbench:conlang_translation:mkafala_to + - helm|bigbench:conlang_translation:postpositive_english_from + - helm|bigbench:conlang_translation:postpositive_english_to + - helm|bigbench:conlang_translation:unapuri_from + - helm|bigbench:conlang_translation:unapuri_to + - helm|bigbench:conlang_translation:vaomi_from + - helm|bigbench:conlang_translation:vaomi_to + - helm|bigbench:emoji_movie + - helm|bigbench:formal_fallacies_syllogisms_negation + - helm|bigbench:hindu_knowledge + - helm|bigbench:known_unknowns + - helm|bigbench:language_identification + - helm|bigbench:linguistics_puzzles + - helm|bigbench:logic_grid_puzzle + - helm|bigbench:logical_deduction-five_objects + - helm|bigbench:logical_deduction-seven_objects + - helm|bigbench:logical_deduction-three_objects + - helm|bigbench:misconceptions_russian + - helm|bigbench:novel_concepts + - helm|bigbench:operators + - helm|bigbench:parsinlu_reading_comprehension + - helm|bigbench:play_dialog_same_or_different + - helm|bigbench:repeat_copy_logic + - helm|bigbench:strange_stories-boolean + - helm|bigbench:strange_stories-multiple_choice + - helm|bigbench:strategyqa + - helm|bigbench:symbol_interpretation-adversarial + - helm|bigbench:symbol_interpretation-emoji_agnostic + - helm|bigbench:symbol_interpretation-name_agnostic + - helm|bigbench:symbol_interpretation-plain + - helm|bigbench:symbol_interpretation-tricky + - helm|bigbench:vitaminc_fact_verification + - helm|bigbench:winowhy + - helm|blimp:adjunct_island + - helm|blimp:anaphor_gender_agreement + - helm|blimp:anaphor_number_agreement + - helm|blimp:animate_subject_passive + - helm|blimp:animate_subject_trans + - helm|blimp:causative + - helm|blimp:complex_NP_island + - helm|blimp:coordinate_structure_constraint_complex_left_branch + - helm|blimp:coordinate_structure_constraint_object_extraction + - helm|blimp:determiner_noun_agreement_1 + - helm|blimp:determiner_noun_agreement_2 + - helm|blimp:determiner_noun_agreement_irregular_1 + - helm|blimp:determiner_noun_agreement_irregular_2 + - helm|blimp:determiner_noun_agreement_with_adj_2 + - helm|blimp:determiner_noun_agreement_with_adj_irregular_1 + - helm|blimp:determiner_noun_agreement_with_adj_irregular_2 + - helm|blimp:determiner_noun_agreement_with_adjective_1 + - helm|blimp:distractor_agreement_relational_noun + - helm|blimp:distractor_agreement_relative_clause + - helm|blimp:drop_argument + - helm|blimp:ellipsis_n_bar_1 + - helm|blimp:ellipsis_n_bar_2 + - helm|blimp:existential_there_object_raising + - helm|blimp:existential_there_quantifiers_1 + - helm|blimp:existential_there_quantifiers_2 + - 
helm|blimp:existential_there_subject_raising + - helm|blimp:expletive_it_object_raising + - helm|blimp:inchoative + - helm|blimp:intransitive + - helm|blimp:irregular_past_participle_adjectives + - helm|blimp:irregular_past_participle_verbs + - helm|blimp:irregular_plural_subject_verb_agreement_1 + - helm|blimp:irregular_plural_subject_verb_agreement_2 + - helm|blimp:left_branch_island_echo_question + - helm|blimp:left_branch_island_simple_question + - helm|blimp:matrix_question_npi_licensor_present + - helm|blimp:npi_present_1 + - helm|blimp:npi_present_2 + - helm|blimp:only_npi_licensor_present + - helm|blimp:only_npi_scope + - helm|blimp:passive_1 + - helm|blimp:passive_2 + - helm|blimp:principle_A_c_command + - helm|blimp:principle_A_case_1 + - helm|blimp:principle_A_case_2 + - helm|blimp:principle_A_domain_1 + - helm|blimp:principle_A_domain_2 + - helm|blimp:principle_A_domain_3 + - helm|blimp:principle_A_reconstruction + - helm|blimp:regular_plural_subject_verb_agreement_1 + - helm|blimp:regular_plural_subject_verb_agreement_2 + - helm|blimp:sentential_negation_npi_licensor_present + - helm|blimp:sentential_negation_npi_scope + - helm|blimp:sentential_subject_island + - helm|blimp:superlative_quantifiers_1 + - helm|blimp:superlative_quantifiers_2 + - helm|blimp:tough_vs_raising_1 + - helm|blimp:tough_vs_raising_2 + - helm|blimp:transitive + - helm|blimp:wh_island + - helm|blimp:wh_questions_object_gap + - helm|blimp:wh_questions_subject_gap + - helm|blimp:wh_questions_subject_gap_long_distance + - helm|blimp:wh_vs_that_no_gap + - helm|blimp:wh_vs_that_no_gap_long_distance + - helm|blimp:wh_vs_that_with_gap + - helm|blimp:wh_vs_that_with_gap_long_distance + - helm|bold + - helm|bold:gender + - helm|bold:political_ideology + - helm|bold:profession + - helm|bold:race + - helm|bold:religious_ideology + - helm|boolq + - helm|boolq:contrastset + - helm|civil_comments + - helm|civil_comments:LGBTQ + - helm|civil_comments:black + - helm|civil_comments:christian + - helm|civil_comments:female + - helm|civil_comments:male + - helm|civil_comments:muslim + - helm|civil_comments:other_religions + - helm|civil_comments:white + - helm|commonsenseqa + - helm|copyright:n_books_1000-extractions_per_book_1-prefix_length_125 + - helm|copyright:n_books_1000-extractions_per_book_1-prefix_length_25 + - helm|copyright:n_books_1000-extractions_per_book_1-prefix_length_5 + - helm|copyright:n_books_1000-extractions_per_book_3-prefix_length_125 + - helm|copyright:n_books_1000-extractions_per_book_3-prefix_length_25 + - helm|copyright:n_books_1000-extractions_per_book_3-prefix_length_5 + - helm|copyright:oh_the_places + - helm|copyright:pilot + - helm|copyright:popular_books-prefix_length_10 + - helm|copyright:popular_books-prefix_length_125 + - helm|copyright:popular_books-prefix_length_25 + - helm|copyright:popular_books-prefix_length_250 + - helm|copyright:popular_books-prefix_length_5 + - helm|copyright:popular_books-prefix_length_50 + - helm|copyright:prompt_num_line_1-min_lines_20 + - helm|copyright:prompt_num_line_10-min_lines_20 + - helm|copyright:prompt_num_line_5-min_lines_20 + - helm|covid_dialogue + - helm|dyck_language:2 + - helm|dyck_language:3 + - helm|dyck_language:4 + - helm|entity_data_imputation:Buy + - helm|entity_data_imputation:Restaurant + - helm|entity_matching:Abt_Buy + - helm|entity_matching:Amazon_Google + - helm|entity_matching:Beer + - helm|entity_matching:Company + - helm|entity_matching:DBLP_ACM + - helm|entity_matching:DBLP_GoogleScholar + - helm|entity_matching:Dirty_DBLP_ACM + - 
helm|entity_matching:Dirty_DBLP_GoogleScholar + - helm|entity_matching:Dirty_Walmart_Amazon + - helm|entity_matching:Dirty_iTunes_Amazon + - helm|entity_matching:Walmart_Amazon + - helm|entity_matching:iTunes_Amazon + - helm|entity_matching=Fodors_Zagats + - helm|hellaswag + - helm|imdb + - helm|imdb:contrastset + - helm|interactive_qa_mmlu:abstract_algebra + - helm|interactive_qa_mmlu:college_chemistry + - helm|interactive_qa_mmlu:global_facts + - helm|interactive_qa_mmlu:miscellaneous + - helm|interactive_qa_mmlu:nutrition + - helm|interactive_qa_mmlu:us_foreign_policy + - helm|legal_summarization:billsum + - helm|legal_summarization:eurlexsum + - helm|legal_summarization:multilexsum + - helm|legalsupport + - helm|lexglue:case_hold + - helm|lexglue:ecthr_a + - helm|lexglue:ecthr_b + - helm|lexglue:eurlex + - helm|lexglue:ledgar + - helm|lexglue:scotus + - helm|lexglue:unfair_tos + - helm|lextreme:brazilian_court_decisions_judgment + - helm|lextreme:brazilian_court_decisions_unanimity + - helm|lextreme:covid19_emergency_event + - helm|lextreme:german_argument_mining + - helm|lextreme:greek_legal_code_chapter + - helm|lextreme:greek_legal_code_subject + - helm|lextreme:greek_legal_code_volume + - helm|lextreme:greek_legal_ner + - helm|lextreme:legalnero + - helm|lextreme:lener_br + - helm|lextreme:mapa_coarse + - helm|lextreme:mapa_fine + - helm|lextreme:multi_eurlex_level_1 + - helm|lextreme:multi_eurlex_level_2 + - helm|lextreme:multi_eurlex_level_3 + - helm|lextreme:online_terms_of_service_clause_topics + - helm|lextreme:online_terms_of_service_unfairness_levels + - helm|lextreme:swiss_judgment_prediction + - helm|lsat_qa + - helm|lsat_qa:assignment + - helm|lsat_qa:grouping + - helm|lsat_qa:miscellaneous + - helm|lsat_qa:ordering + - helm|me_q_sum + - helm|med_dialog:healthcaremagic + - helm|med_dialog:icliniq + - helm|med_mcqa + - helm|med_paragraph_simplification + - helm|med_qa + - helm|mmlu + - helm|mmlu:abstract_algebra + - helm|mmlu:anatomy + - helm|mmlu:astronomy + - helm|mmlu:business_ethics + - helm|mmlu:clinical_knowledge + - helm|mmlu:college_biology + - helm|mmlu:college_chemistry + - helm|mmlu:college_computer_science + - helm|mmlu:college_mathematics + - helm|mmlu:college_medicine + - helm|mmlu:college_physics + - helm|mmlu:computer_security + - helm|mmlu:conceptual_physics + - helm|mmlu:econometrics + - helm|mmlu:electrical_engineering + - helm|mmlu:elementary_mathematics + - helm|mmlu:formal_logic + - helm|mmlu:global_facts + - helm|mmlu:high_school_biology + - helm|mmlu:high_school_chemistry + - helm|mmlu:high_school_computer_science + - helm|mmlu:high_school_european_history + - helm|mmlu:high_school_geography + - helm|mmlu:high_school_government_and_politics + - helm|mmlu:high_school_macroeconomics + - helm|mmlu:high_school_mathematics + - helm|mmlu:high_school_microeconomics + - helm|mmlu:high_school_physics + - helm|mmlu:high_school_psychology + - helm|mmlu:high_school_statistics + - helm|mmlu:high_school_us_history + - helm|mmlu:high_school_world_history + - helm|mmlu:human_aging + - helm|mmlu:human_sexuality + - helm|mmlu:international_law + - helm|mmlu:jurisprudence + - helm|mmlu:logical_fallacies + - helm|mmlu:machine_learning + - helm|mmlu:management + - helm|mmlu:marketing + - helm|mmlu:medical_genetics + - helm|mmlu:miscellaneous + - helm|mmlu:moral_disputes + - helm|mmlu:moral_scenarios + - helm|mmlu:nutrition + - helm|mmlu:philosophy + - helm|mmlu:prehistory + - helm|mmlu:professional_accounting + - helm|mmlu:professional_law + - 
helm|mmlu:professional_medicine + - helm|mmlu:professional_psychology + - helm|mmlu:public_relations + - helm|mmlu:security_studies + - helm|mmlu:sociology + - helm|mmlu:us_foreign_policy + - helm|mmlu:virology + - helm|mmlu:world_religions + - helm|narrativeqa + - helm|numeracy:linear_example + - helm|numeracy:linear_standard + - helm|numeracy:parabola_example + - helm|numeracy:parabola_standard + - helm|numeracy:paraboloid_example + - helm|numeracy:paraboloid_standard + - helm|numeracy:plane_example + - helm|numeracy:plane_standard + - helm|openbookqa + - helm|piqa + - helm|pubmedqa + - helm|quac + - helm|raft:ade_corpus_v2 + - helm|raft:banking_77 + - helm|raft:neurips_impact_statement_risks + - helm|raft:one_stop_english + - helm|raft:overruling + - helm|raft:semiconductor_org_types + - helm|raft:systematic_review_inclusion + - helm|raft:tai_safety_research + - helm|raft:terms_of_service + - helm|raft:tweet_eval_hate + - helm|raft:twitter_complaints + - helm|real_toxicity_prompts + - helm|siqa + - helm|summarization:cnn-dm + - helm|summarization:xsum + - helm|summarization:xsum-sampled + - helm|synthetic_reasoning:induction + - helm|synthetic_reasoning:natural_easy + - helm|synthetic_reasoning:natural_hard + - helm|synthetic_reasoning:pattern_match + - helm|synthetic_reasoning:variable_substitution + - helm|the_pile:arxiv + - helm|the_pile:bibliotik + - helm|the_pile:commoncrawl + - helm|the_pile:dm-mathematics + - helm|the_pile:enron + - helm|the_pile:europarl + - helm|the_pile:freelaw + - helm|the_pile:github + - helm|the_pile:gutenberg + - helm|the_pile:hackernews + - helm|the_pile:nih-exporter + - helm|the_pile:opensubtitles + - helm|the_pile:openwebtext2 + - helm|the_pile:pubmed-abstracts + - helm|the_pile:pubmed-central + - helm|the_pile:stackexchange + - helm|the_pile:upsto + - helm|the_pile:wikipedia + - helm|the_pile:youtubesubtitles + - helm|truthfulqa + - helm|twitterAAE:aa + - helm|twitterAAE:white + - helm|wikifact:applies_to_jurisdiction + - helm|wikifact:atomic_number + - helm|wikifact:author + - helm|wikifact:award_received + - helm|wikifact:basic_form_of_government + - helm|wikifact:capital + - helm|wikifact:capital_of + - helm|wikifact:central_bank + - helm|wikifact:composer + - helm|wikifact:continent + - helm|wikifact:country + - helm|wikifact:country_of_citizenship + - helm|wikifact:country_of_origin + - helm|wikifact:creator + - helm|wikifact:currency + - helm|wikifact:defendant + - helm|wikifact:developer + - helm|wikifact:diplomatic_relation + - helm|wikifact:director + - helm|wikifact:discoverer_or_inventor + - helm|wikifact:drug_or_therapy_used_for_treatment + - helm|wikifact:educated_at + - helm|wikifact:electron_configuration + - helm|wikifact:employer + - helm|wikifact:field_of_work + - helm|wikifact:file_extension + - helm|wikifact:genetic_association + - helm|wikifact:genre + - helm|wikifact:has_part + - helm|wikifact:head_of_government + - helm|wikifact:head_of_state + - helm|wikifact:headquarters_location + - helm|wikifact:industry + - helm|wikifact:influenced_by + - helm|wikifact:instance_of + - helm|wikifact:instrument + - helm|wikifact:language_of_work_or_name + - helm|wikifact:languages_spoken_written_or_signed + - helm|wikifact:laws_applied + - helm|wikifact:located_in_the_administrative_territorial_entity + - helm|wikifact:location + - helm|wikifact:location_of_discovery + - helm|wikifact:location_of_formation + - helm|wikifact:majority_opinion_by + - helm|wikifact:manufacturer + - helm|wikifact:measured_physical_quantity + - 
helm|wikifact:medical_condition_treated + - helm|wikifact:member_of + - helm|wikifact:member_of_political_party + - helm|wikifact:member_of_sports_team + - helm|wikifact:movement + - helm|wikifact:named_after + - helm|wikifact:native_language + - helm|wikifact:number_of_processor_cores + - helm|wikifact:occupation + - helm|wikifact:office_held_by_head_of_government + - helm|wikifact:office_held_by_head_of_state + - helm|wikifact:official_language + - helm|wikifact:operating_system + - helm|wikifact:original_language_of_film_or_TV_show + - helm|wikifact:original_network + - helm|wikifact:overrules + - helm|wikifact:owned_by + - helm|wikifact:part_of + - helm|wikifact:participating_team + - helm|wikifact:place_of_birth + - helm|wikifact:place_of_death + - helm|wikifact:plaintiff + - helm|wikifact:position_held + - helm|wikifact:position_played_on_team + - helm|wikifact:programming_language + - helm|wikifact:recommended_unit_of_measurement + - helm|wikifact:record_label + - helm|wikifact:religion + - helm|wikifact:repealed_by + - helm|wikifact:shares_border_with + - helm|wikifact:solved_by + - helm|wikifact:statement_describes + - helm|wikifact:stock_exchange + - helm|wikifact:subclass_of + - helm|wikifact:subsidiary + - helm|wikifact:symptoms_and_signs + - helm|wikifact:therapeutic_area + - helm|wikifact:time_of_discovery_or_invention + - helm|wikifact:twinned_administrative_body + - helm|wikifact:work_location + - helm|wikitext:103:document_level + - helm|wmt14:cs-en + - helm|wmt14:de-en + - helm|wmt14:fr-en + - helm|wmt14:hi-en + - helm|wmt14:ru-en + +- leaderboard: + - leaderboard|arc:challenge + - leaderboard|gsm8k + - leaderboard|hellaswag + - leaderboard|mmlu:abstract_algebra + - leaderboard|mmlu:anatomy + - leaderboard|mmlu:astronomy + - leaderboard|mmlu:business_ethics + - leaderboard|mmlu:clinical_knowledge + - leaderboard|mmlu:college_biology + - leaderboard|mmlu:college_chemistry + - leaderboard|mmlu:college_computer_science + - leaderboard|mmlu:college_mathematics + - leaderboard|mmlu:college_medicine + - leaderboard|mmlu:college_physics + - leaderboard|mmlu:computer_security + - leaderboard|mmlu:conceptual_physics + - leaderboard|mmlu:econometrics + - leaderboard|mmlu:electrical_engineering + - leaderboard|mmlu:elementary_mathematics + - leaderboard|mmlu:formal_logic + - leaderboard|mmlu:global_facts + - leaderboard|mmlu:high_school_biology + - leaderboard|mmlu:high_school_chemistry + - leaderboard|mmlu:high_school_computer_science + - leaderboard|mmlu:high_school_european_history + - leaderboard|mmlu:high_school_geography + - leaderboard|mmlu:high_school_government_and_politics + - leaderboard|mmlu:high_school_macroeconomics + - leaderboard|mmlu:high_school_mathematics + - leaderboard|mmlu:high_school_microeconomics + - leaderboard|mmlu:high_school_physics + - leaderboard|mmlu:high_school_psychology + - leaderboard|mmlu:high_school_statistics + - leaderboard|mmlu:high_school_us_history + - leaderboard|mmlu:high_school_world_history + - leaderboard|mmlu:human_aging + - leaderboard|mmlu:human_sexuality + - leaderboard|mmlu:international_law + - leaderboard|mmlu:jurisprudence + - leaderboard|mmlu:logical_fallacies + - leaderboard|mmlu:machine_learning + - leaderboard|mmlu:management + - leaderboard|mmlu:marketing + - leaderboard|mmlu:medical_genetics + - leaderboard|mmlu:miscellaneous + - leaderboard|mmlu:moral_disputes + - leaderboard|mmlu:moral_scenarios + - leaderboard|mmlu:nutrition + - leaderboard|mmlu:philosophy + - leaderboard|mmlu:prehistory + - 
leaderboard|mmlu:professional_accounting + - leaderboard|mmlu:professional_law + - leaderboard|mmlu:professional_medicine + - leaderboard|mmlu:professional_psychology + - leaderboard|mmlu:public_relations + - leaderboard|mmlu:security_studies + - leaderboard|mmlu:sociology + - leaderboard|mmlu:us_foreign_policy + - leaderboard|mmlu:virology + - leaderboard|mmlu:world_religions + - leaderboard|truthfulqa:mc + - leaderboard|winogrande + +- lighteval: + - lighteval|agieval:aqua-rat + - lighteval|agieval:gaokao-biology + - lighteval|agieval:gaokao-chemistry + - lighteval|agieval:gaokao-chinese + - lighteval|agieval:gaokao-english + - lighteval|agieval:gaokao-geography + - lighteval|agieval:gaokao-history + - lighteval|agieval:gaokao-mathqa + - lighteval|agieval:gaokao-physics + - lighteval|agieval:logiqa-en + - lighteval|agieval:logiqa-zh + - lighteval|agieval:lsat-ar + - lighteval|agieval:lsat-lr + - lighteval|agieval:lsat-rc + - lighteval|agieval:sat-en + - lighteval|agieval:sat-en-without-passage + - lighteval|agieval:sat-math + - lighteval|anli + - lighteval|anli:r1 + - lighteval|anli:r2 + - lighteval|anli:r3 + - lighteval|arc:easy + - lighteval|arithmetic:1dc + - lighteval|arithmetic:2da + - lighteval|arithmetic:2dm + - lighteval|arithmetic:2ds + - lighteval|arithmetic:3da + - lighteval|arithmetic:3ds + - lighteval|arithmetic:4da + - lighteval|arithmetic:4ds + - lighteval|arithmetic:5da + - lighteval|arithmetic:5ds + - lighteval|asdiv + - lighteval|bigbench:causal_judgment + - lighteval|bigbench:date_understanding + - lighteval|bigbench:disambiguation_qa + - lighteval|bigbench:geometric_shapes + - lighteval|bigbench:logical_deduction_five_objects + - lighteval|bigbench:logical_deduction_seven_objects + - lighteval|bigbench:logical_deduction_three_objects + - lighteval|bigbench:movie_recommendation + - lighteval|bigbench:navigate + - lighteval|bigbench:reasoning_about_colored_objects + - lighteval|bigbench:ruin_names + - lighteval|bigbench:salient_translation_error_detection + - lighteval|bigbench:snarks + - lighteval|bigbench:sports_understanding + - lighteval|bigbench:temporal_sequences + - lighteval|bigbench:tracking_shuffled_objects_five_objects + - lighteval|bigbench:tracking_shuffled_objects_seven_objects + - lighteval|bigbench:tracking_shuffled_objects_three_objects + - lighteval|blimp:adjunct_island + - lighteval|blimp:anaphor_gender_agreement + - lighteval|blimp:anaphor_number_agreement + - lighteval|blimp:animate_subject_passive + - lighteval|blimp:animate_subject_trans + - lighteval|blimp:causative + - lighteval|blimp:complex_NP_island + - lighteval|blimp:coordinate_structure_constraint_complex_left_branch + - lighteval|blimp:coordinate_structure_constraint_object_extraction + - lighteval|blimp:determiner_noun_agreement_1 + - lighteval|blimp:determiner_noun_agreement_2 + - lighteval|blimp:determiner_noun_agreement_irregular_1 + - lighteval|blimp:determiner_noun_agreement_irregular_2 + - lighteval|blimp:determiner_noun_agreement_with_adj_2 + - lighteval|blimp:determiner_noun_agreement_with_adj_irregular_1 + - lighteval|blimp:determiner_noun_agreement_with_adj_irregular_2 + - lighteval|blimp:determiner_noun_agreement_with_adjective_1 + - lighteval|blimp:distractor_agreement_relational_noun + - lighteval|blimp:distractor_agreement_relative_clause + - lighteval|blimp:drop_argument + - lighteval|blimp:ellipsis_n_bar_1 + - lighteval|blimp:ellipsis_n_bar_2 + - lighteval|blimp:existential_there_object_raising + - lighteval|blimp:existential_there_quantifiers_1 + - 
lighteval|blimp:existential_there_quantifiers_2 + - lighteval|blimp:existential_there_subject_raising + - lighteval|blimp:expletive_it_object_raising + - lighteval|blimp:inchoative + - lighteval|blimp:intransitive + - lighteval|blimp:irregular_past_participle_adjectives + - lighteval|blimp:irregular_past_participle_verbs + - lighteval|blimp:irregular_plural_subject_verb_agreement_1 + - lighteval|blimp:irregular_plural_subject_verb_agreement_2 + - lighteval|blimp:left_branch_island_echo_question + - lighteval|blimp:left_branch_island_simple_question + - lighteval|blimp:matrix_question_npi_licensor_present + - lighteval|blimp:npi_present_1 + - lighteval|blimp:npi_present_2 + - lighteval|blimp:only_npi_licensor_present + - lighteval|blimp:only_npi_scope + - lighteval|blimp:passive_1 + - lighteval|blimp:passive_2 + - lighteval|blimp:principle_A_c_command + - lighteval|blimp:principle_A_case_1 + - lighteval|blimp:principle_A_case_2 + - lighteval|blimp:principle_A_domain_1 + - lighteval|blimp:principle_A_domain_2 + - lighteval|blimp:principle_A_domain_3 + - lighteval|blimp:principle_A_reconstruction + - lighteval|blimp:regular_plural_subject_verb_agreement_1 + - lighteval|blimp:regular_plural_subject_verb_agreement_2 + - lighteval|blimp:sentential_negation_npi_licensor_present + - lighteval|blimp:sentential_negation_npi_scope + - lighteval|blimp:sentential_subject_island + - lighteval|blimp:superlative_quantifiers_1 + - lighteval|blimp:superlative_quantifiers_2 + - lighteval|blimp:tough_vs_raising_1 + - lighteval|blimp:tough_vs_raising_2 + - lighteval|blimp:transitive + - lighteval|blimp:wh_island + - lighteval|blimp:wh_questions_object_gap + - lighteval|blimp:wh_questions_subject_gap + - lighteval|blimp:wh_questions_subject_gap_long_distance + - lighteval|blimp:wh_vs_that_no_gap + - lighteval|blimp:wh_vs_that_no_gap_long_distance + - lighteval|blimp:wh_vs_that_with_gap + - lighteval|blimp:wh_vs_that_with_gap_long_distance + - lighteval|coqa + - lighteval|coqa_bb + - lighteval|drop + - lighteval|ethics:commonsense + - lighteval|ethics:deontology + - lighteval|ethics:justice + - lighteval|ethics:utilitarianism + - lighteval|ethics:virtue + - lighteval|glue:cola + - lighteval|glue:mnli + - lighteval|glue:mnli_mismatched + - lighteval|glue:mrpc + - lighteval|glue:qnli + - lighteval|glue:qqp + - lighteval|glue:rte + - lighteval|glue:sst2 + - lighteval|glue:stsb + - lighteval|glue:wnli + - lighteval|gpqa + - lighteval|gsm8k + - lighteval|headqa:en + - lighteval|headqa:es + - lighteval|iwslt17:ar-en + - lighteval|iwslt17:de-en + - lighteval|iwslt17:en-ar + - lighteval|iwslt17:en-de + - lighteval|iwslt17:en-fr + - lighteval|iwslt17:en-ja + - lighteval|iwslt17:en-ko + - lighteval|iwslt17:en-zh + - lighteval|iwslt17:fr-en + - lighteval|iwslt17:ja-en + - lighteval|iwslt17:ko-en + - lighteval|iwslt17:zh-en + - lighteval|lambada:openai + - lighteval|lambada:openai:de + - lighteval|lambada:openai:en + - lighteval|lambada:openai:es + - lighteval|lambada:openai:fr + - lighteval|lambada:openai:it + - lighteval|lambada:openai_cloze + - lighteval|lambada:standard + - lighteval|lambada:standard_cloze + - lighteval|logiqa + - lighteval|math:algebra + - lighteval|math:counting_and_probability + - lighteval|math:geometry + - lighteval|math:intermediate_algebra + - lighteval|math:number_theory + - lighteval|math:prealgebra + - lighteval|math:precalculus + - lighteval|math_cot:algebra + - lighteval|math_cot:counting_and_probability + - lighteval|math_cot:geometry + - lighteval|math_cot:intermediate_algebra + - 
lighteval|math_cot:number_theory + - lighteval|math_cot:prealgebra + - lighteval|math_cot:precalculus + - lighteval|mathqa + - lighteval|mgsm:bn + - lighteval|mgsm:de + - lighteval|mgsm:en + - lighteval|mgsm:es + - lighteval|mgsm:fr + - lighteval|mgsm:ja + - lighteval|mgsm:ru + - lighteval|mgsm:sw + - lighteval|mgsm:te + - lighteval|mgsm:th + - lighteval|mgsm:zh + - lighteval|mtnt2019:en-fr + - lighteval|mtnt2019:en-ja + - lighteval|mtnt2019:fr-en + - lighteval|mtnt2019:ja-en + - lighteval|mutual + - lighteval|mutual_plus + - lighteval|openbookqa + - lighteval|piqa + - lighteval|prost + - lighteval|pubmedqa + - lighteval|qa4mre:2011 + - lighteval|qa4mre:2012 + - lighteval|qa4mre:2013 + - lighteval|qasper + - lighteval|qasper_ll + - lighteval|race:high + - lighteval|sciq + - lighteval|storycloze:2016 + - lighteval|storycloze:2018 + - lighteval|super_glue:boolq + - lighteval|super_glue:cb + - lighteval|super_glue:copa + - lighteval|super_glue:multirc + - lighteval|super_glue:rte + - lighteval|super_glue:wic + - lighteval|super_glue:wsc + - lighteval|swag + - lighteval|the_pile:arxiv + - lighteval|the_pile:bookcorpus2 + - lighteval|the_pile:books3 + - lighteval|the_pile:dm-mathematics + - lighteval|the_pile:enron + - lighteval|the_pile:europarl + - lighteval|the_pile:freelaw + - lighteval|the_pile:github + - lighteval|the_pile:gutenberg + - lighteval|the_pile:hackernews + - lighteval|the_pile:nih-exporter + - lighteval|the_pile:opensubtitles + - lighteval|the_pile:openwebtext2 + - lighteval|the_pile:philpapers + - lighteval|the_pile:pile-cc + - lighteval|the_pile:pubmed-abstracts + - lighteval|the_pile:pubmed-central + - lighteval|the_pile:stackexchange + - lighteval|the_pile:ubuntu-irc + - lighteval|the_pile:uspto + - lighteval|the_pile:wikipedia + - lighteval|the_pile:youtubesubtitles + - lighteval|toxigen + - lighteval|triviaqa + - lighteval|truthfulqa:gen + - lighteval|unscramble:anagrams1 + - lighteval|unscramble:anagrams2 + - lighteval|unscramble:cycle_letters + - lighteval|unscramble:random_insertion + - lighteval|unscramble:reversed_words + - lighteval|webqs + - lighteval|wikitext:2 + - lighteval|wmt08:cs-en + - lighteval|wmt08:de-en + - lighteval|wmt08:en-cs + - lighteval|wmt08:en-de + - lighteval|wmt08:en-es + - lighteval|wmt08:en-fr + - lighteval|wmt08:en-hu + - lighteval|wmt08:es-en + - lighteval|wmt08:fr-en + - lighteval|wmt08:hu-en + - lighteval|wmt09:cs-en + - lighteval|wmt09:de-en + - lighteval|wmt09:en-cs + - lighteval|wmt09:en-de + - lighteval|wmt09:en-es + - lighteval|wmt09:en-fr + - lighteval|wmt09:en-hu + - lighteval|wmt09:en-it + - lighteval|wmt09:es-en + - lighteval|wmt09:fr-en + - lighteval|wmt09:hu-en + - lighteval|wmt09:it-en + - lighteval|wmt10:cs-en + - lighteval|wmt10:de-en + - lighteval|wmt10:en-cs + - lighteval|wmt10:en-de + - lighteval|wmt10:en-es + - lighteval|wmt10:en-fr + - lighteval|wmt10:es-en + - lighteval|wmt10:fr-en + - lighteval|wmt11:cs-en + - lighteval|wmt11:de-en + - lighteval|wmt11:en-cs + - lighteval|wmt11:en-de + - lighteval|wmt11:en-es + - lighteval|wmt11:en-fr + - lighteval|wmt11:es-en + - lighteval|wmt11:fr-en + - lighteval|wmt12:cs-en + - lighteval|wmt12:de-en + - lighteval|wmt12:en-cs + - lighteval|wmt12:en-de + - lighteval|wmt12:en-es + - lighteval|wmt12:en-fr + - lighteval|wmt12:es-en + - lighteval|wmt12:fr-en + - lighteval|wmt13:cs-en + - lighteval|wmt13:de-en + - lighteval|wmt13:en-cs + - lighteval|wmt13:en-de + - lighteval|wmt13:en-es + - lighteval|wmt13:en-fr + - lighteval|wmt13:en-ru + - lighteval|wmt13:es-en + - lighteval|wmt13:fr-en 
+ - lighteval|wmt13:ru-en + - lighteval|wmt14:cs-en + - lighteval|wmt14:de-en + - lighteval|wmt14:en-cs + - lighteval|wmt14:en-de + - lighteval|wmt14:en-fr + - lighteval|wmt14:en-hi + - lighteval|wmt14:en-ru + - lighteval|wmt14:fr-en + - lighteval|wmt14:hi-en + - lighteval|wmt14:ru-en + - lighteval|wmt15:cs-en + - lighteval|wmt15:de-en + - lighteval|wmt15:en-cs + - lighteval|wmt15:en-de + - lighteval|wmt15:en-fi + - lighteval|wmt15:en-fr + - lighteval|wmt15:en-ru + - lighteval|wmt15:fi-en + - lighteval|wmt15:fr-en + - lighteval|wmt15:ru-en + - lighteval|wmt16:cs-en + - lighteval|wmt16:de-en + - lighteval|wmt16:en-cs + - lighteval|wmt16:en-de + - lighteval|wmt16:en-fi + - lighteval|wmt16:en-ro + - lighteval|wmt16:en-ru + - lighteval|wmt16:en-tr + - lighteval|wmt16:fi-en + - lighteval|wmt16:ro-en + - lighteval|wmt16:ru-en + - lighteval|wmt16:tr-en + - lighteval|wmt17:cs-en + - lighteval|wmt17:de-en + - lighteval|wmt17:en-cs + - lighteval|wmt17:en-de + - lighteval|wmt17:en-fi + - lighteval|wmt17:en-lv + - lighteval|wmt17:en-ru + - lighteval|wmt17:en-tr + - lighteval|wmt17:en-zh + - lighteval|wmt17:fi-en + - lighteval|wmt17:lv-en + - lighteval|wmt17:ru-en + - lighteval|wmt17:tr-en + - lighteval|wmt17:zh-en + - lighteval|wmt18:cs-en + - lighteval|wmt18:de-en + - lighteval|wmt18:en-cs + - lighteval|wmt18:en-de + - lighteval|wmt18:en-et + - lighteval|wmt18:en-fi + - lighteval|wmt18:en-ru + - lighteval|wmt18:en-tr + - lighteval|wmt18:en-zh + - lighteval|wmt18:et-en + - lighteval|wmt18:fi-en + - lighteval|wmt18:ru-en + - lighteval|wmt18:tr-en + - lighteval|wmt18:zh-en + - lighteval|wmt19:cs-de + - lighteval|wmt19:de-cs + - lighteval|wmt19:de-en + - lighteval|wmt19:de-fr + - lighteval|wmt19:en-cs + - lighteval|wmt19:en-de + - lighteval|wmt19:en-fi + - lighteval|wmt19:en-gu + - lighteval|wmt19:en-kk + - lighteval|wmt19:en-lt + - lighteval|wmt19:en-ru + - lighteval|wmt19:en-zh + - lighteval|wmt19:fi-en + - lighteval|wmt19:fr-de + - lighteval|wmt19:gu-en + - lighteval|wmt19:kk-en + - lighteval|wmt19:lt-en + - lighteval|wmt19:ru-en + - lighteval|wmt19:zh-en + - lighteval|wmt20:cs-en + - lighteval|wmt20:de-en + - lighteval|wmt20:de-fr + - lighteval|wmt20:en-cs + - lighteval|wmt20:en-de + - lighteval|wmt20:en-iu + - lighteval|wmt20:en-ja + - lighteval|wmt20:en-km + - lighteval|wmt20:en-pl + - lighteval|wmt20:en-ps + - lighteval|wmt20:en-ru + - lighteval|wmt20:en-ta + - lighteval|wmt20:en-zh + - lighteval|wmt20:fr-de + - lighteval|wmt20:iu-en + - lighteval|wmt20:ja-en + - lighteval|wmt20:km-en + - lighteval|wmt20:pl-en + - lighteval|wmt20:ps-en + - lighteval|wmt20:ru-en + - lighteval|wmt20:ta-en + - lighteval|wmt20:zh-en + - lighteval|wsc273 + - lighteval|xcopa:en + - lighteval|xcopa:et + - lighteval|xcopa:ht + - lighteval|xcopa:id + - lighteval|xcopa:it + - lighteval|xcopa:qu + - lighteval|xcopa:sw + - lighteval|xcopa:ta + - lighteval|xcopa:th + - lighteval|xcopa:tr + - lighteval|xcopa:vi + - lighteval|xcopa:zh + - lighteval|xstory_cloze:ar + - lighteval|xstory_cloze:en + - lighteval|xstory_cloze:es + - lighteval|xstory_cloze:eu + - lighteval|xstory_cloze:hi + - lighteval|xstory_cloze:id + - lighteval|xstory_cloze:my + - lighteval|xstory_cloze:ru + - lighteval|xstory_cloze:sw + - lighteval|xstory_cloze:te + - lighteval|xstory_cloze:zh + - lighteval|xwinograd:en + - lighteval|xwinograd:fr + - lighteval|xwinograd:jp + - lighteval|xwinograd:pt + - lighteval|xwinograd:ru + - lighteval|xwinograd:zh + +- original: + - original|arc:c:letters + - original|arc:c:options + - original|arc:c:simple + - original|mmlu 
+ - original|mmlu:abstract_algebra + - original|mmlu:anatomy + - original|mmlu:astronomy + - original|mmlu:business_ethics + - original|mmlu:clinical_knowledge + - original|mmlu:college_biology + - original|mmlu:college_chemistry + - original|mmlu:college_computer_science + - original|mmlu:college_mathematics + - original|mmlu:college_medicine + - original|mmlu:college_physics + - original|mmlu:computer_security + - original|mmlu:conceptual_physics + - original|mmlu:econometrics + - original|mmlu:electrical_engineering + - original|mmlu:elementary_mathematics + - original|mmlu:formal_logic + - original|mmlu:global_facts + - original|mmlu:high_school_biology + - original|mmlu:high_school_chemistry + - original|mmlu:high_school_computer_science + - original|mmlu:high_school_european_history + - original|mmlu:high_school_geography + - original|mmlu:high_school_government_and_politics + - original|mmlu:high_school_macroeconomics + - original|mmlu:high_school_mathematics + - original|mmlu:high_school_microeconomics + - original|mmlu:high_school_physics + - original|mmlu:high_school_psychology + - original|mmlu:high_school_statistics + - original|mmlu:high_school_us_history + - original|mmlu:high_school_world_history + - original|mmlu:human_aging + - original|mmlu:human_sexuality + - original|mmlu:international_law + - original|mmlu:jurisprudence + - original|mmlu:logical_fallacies + - original|mmlu:machine_learning + - original|mmlu:management + - original|mmlu:marketing + - original|mmlu:medical_genetics + - original|mmlu:miscellaneous + - original|mmlu:moral_disputes + - original|mmlu:moral_scenarios + - original|mmlu:nutrition + - original|mmlu:philosophy + - original|mmlu:prehistory + - original|mmlu:professional_accounting + - original|mmlu:professional_law + - original|mmlu:professional_medicine + - original|mmlu:professional_psychology + - original|mmlu:public_relations + - original|mmlu:security_studies + - original|mmlu:sociology + - original|mmlu:us_foreign_policy + - original|mmlu:virology + - original|mmlu:world_religions diff --git a/docs/source/contributing-to-multilingual-evaluations.mdx b/docs/source/contributing-to-multilingual-evaluations.mdx new file mode 100644 index 000000000..25779bc38 --- /dev/null +++ b/docs/source/contributing-to-multilingual-evaluations.mdx @@ -0,0 +1,107 @@ +# Contributing to multilingual evaluations + +## Contributing a small translation + +We define 19 `literals`, basic keywords or punctuation signs used when creating evaluation prompts in an automatic manner, such as `yes`, `no`, `because`, etc. + +We welcome translations in your language! + +To contribute, you'll need to +1. Open the [translation_literals](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/templates/utils/translation_literals.py) file +2. Edit the file to add or expand the literal for your language of interest. + +```python + Language.ENGLISH: TranslationLiterals( + language=Language.ENGLISH, + question_word="question", # Usage: "Question: How are you?" + answer="answer", # Usage: "Answer: I am fine" + confirmation_word="right", # Usage: "He is smart, right?" + yes="yes", # Usage: "Yes, he is" + no="no", # Usage: "No, he is not" + also="also", # Usage: "Also, she is smart." + cause_word="because", # Usage: "She is smart, because she is tall" + effect_word="therefore", # Usage: "He is tall therefore he is smart" + or_word="or", # Usage: "He is tall or small" + true="true", # Usage: "He is smart, true, false or neither?" 
+ false="false", # Usage: "He is smart, true, false or neither?" + neither="neither", # Usage: "He is smart, true, false or neither?" + # Punctuation and spacing: only adjust if your language uses something different than in English + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + # The first characters of your alphabet used in enumerations, if different from English + indices=["A", "B", "C", ...] + ) +``` + +3. Open a PR with your modifications! And voilà! + +## Contributing a new multilingual task + +You should first read our guide on [adding a custom task](adding-a-custom-task), to better understand the different parameters we use. + +Then, you should take a look at the current [multilingual tasks](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/multilingual/tasks.py) file, to understand how they are defined. For multilingual evaluations the `prompt_function` should be implemented by language-adapted template. The template will take care of correct formatting, correct and consistent usage of language adjusted prompt anchors (e.g Question/Answer) and punctuation. + +Browse the list of all templates [here](https://github.com/huggingface/lighteval/tree/main/src/lighteval/tasks/templates) to see which are the most adapted to your own task. + +Then, when ready, to define your own task, you should: +1. create a Python file as indicated in the above guide +2. import the relevant templates for your task type (XNLI, Copa, Multiple choice, Question Answering, etc) +3. define one or a list of tasks for each relevant language and evaluation formulation (for multichoice) using our parametrizable `LightevalTaskConfig` class + +```python +your_tasks = [ + LightevalTaskConfig( + # Name of your evaluation + name=f"evalname_{language.value}_{formulation.name.lower()}", + # The evaluation is community contributed + suite=["community"], + # This will automatically get the correct metrics for your chosen formulation + metric=get_metrics_for_formulation( + formulation, + [ + loglikelihood_acc_metric(normalization=None), + loglikelihood_acc_metric(normalization=LogProbTokenNorm()), + loglikelihood_acc_metric(normalization=LogProbCharNorm()), + ], + ), + # In this function, you choose which template to follow and for which language and formulation + prompt_function=get_template_prompt_function( + language=language, + # then use the adapter to define the mapping between the + # keys of the template (left), and the keys of your dataset + # (right) + # To know which template keys are required and available, + # consult the appropriate adapter type and doc-string. + adapter=lambda line: { + "key": line["relevant_key"], + ... + }, + formulation=formulation, + ), + # You can also add specific filters to remove irrelevant samples + hf_filter=lambda line: line["label"] in , + # You then select your huggingface dataset as well as + # the splits available for evaluation + hf_repo=, + hf_subset=, + evaluation_splits=["train"], + hf_avail_splits=["train"], + ) + for language in [ + Language.YOUR_LANGUAGE, ... + ] + for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] +] +``` +4. then, you can go back to the guide to test if your task is correctly implemented! + +> [!TIP] +> All `LightevalTaskConfig` parameters are strongly typed, including the inputs to the template function. Make sure to take advantage of your IDE's functionality to make it easier to correctly fill these parameters. 
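+
+For illustration, the `adapter` above simply maps your dataset's column names to
+the keys expected by the chosen template. For a hypothetical multiple-choice
+dataset, it could look like the sketch below (the column names and template keys
+here are assumptions - check the doc-string of the template you import for the
+exact keys it expects):
+
+```python
+# Hypothetical dataset columns: "question", "options" and "answer_idx".
+# The returned keys must match what the chosen template expects.
+def mcq_adapter(line: dict) -> dict:
+    return {
+        "question": line["question"],
+        "choices": line["options"],
+        "gold_idx": line["answer_idx"],
+    }
+```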
+ + +Once everything is good, open a PR, and we'll be happy to review it! \ No newline at end of file diff --git a/docs/source/evaluate-the-model-on-a-server-or-container.mdx b/docs/source/evaluate-the-model-on-a-server-or-container.mdx new file mode 100644 index 000000000..da8f1d4b7 --- /dev/null +++ b/docs/source/evaluate-the-model-on-a-server-or-container.mdx @@ -0,0 +1,67 @@ +# Evaluate the model on a server or container + +An alternative to launching the evaluation locally is to serve the model on a +TGI-compatible server/container and then run the evaluation by sending requests +to the server. The command is the same as before, except you specify a path to +a yaml config file (detailed below): + +```bash +lighteval accelerate \ + --model_config_path="/path/to/config/file"\ + --tasks \ + --output_dir output_dir +``` + +There are two types of configuration files that can be provided for running on +the server: + +### Hugging Face Inference Endpoints + +To launch a model using HuggingFace's Inference Endpoints, you need to provide +the following file: `endpoint_model.yaml`. Lighteval will automatically deploy +the endpoint, run the evaluation, and finally delete the endpoint (unless you +specify an endpoint that was already launched, in which case the endpoint won't +be deleted afterwards). + +__configuration file example:__ + +```yaml +model: + type: "endpoint" + base_params: + endpoint_name: "llama-2-7B-lighteval" # needs to be lower case without special characters + model: "meta-llama/Llama-2-7b-hf" + revision: "main" + dtype: "float16" # can be any of "awq", "eetq", "gptq", "4bit' or "8bit" (will use bitsandbytes), "bfloat16" or "float16" + reuse_existing: false # if true, ignore all params in instance, and don't delete the endpoint after evaluation + instance: + accelerator: "gpu" + region: "eu-west-1" + vendor: "aws" + instance_size: "medium" + instance_type: "g5.2xlarge" + framework: "pytorch" + endpoint_type: "protected" + namespace: null # The namespace under which to launch the endopint. Defaults to the current user's namespace + image_url: null # Optionally specify the docker image to use when launching the endpoint model. E.g., launching models with later releases of the TGI container with support for newer models. + env_vars: + null # Optional environment variables to include when launching the endpoint. e.g., `MAX_INPUT_LENGTH: 2048` + generation: + add_special_tokens: true +``` + +### Text Generation Inference (TGI) + +To use a model already deployed on a TGI server, for example on HuggingFace's +serverless inference. + +__configuration file example:__ + +```yaml +model: + type: "tgi" + instance: + inference_server_address: "" + inference_server_auth: null + model_id: null # Optional, only required if the TGI container was launched with model_id pointing to a local directory +``` diff --git a/docs/source/index.mdx b/docs/source/index.mdx new file mode 100644 index 000000000..9c055f5e4 --- /dev/null +++ b/docs/source/index.mdx @@ -0,0 +1,18 @@ +# Lighteval + +🤗 Lighteval is your all-in-one toolkit for evaluating LLMs across multiple +backends—whether it's +[transformers](https://github.com/huggingface/transformers), +[tgi](https://github.com/huggingface/text-generation-inference), +[vllm](https://github.com/vllm-project/vllm), or +[nanotron](https://github.com/huggingface/nanotron)—with +ease. Dive deep into your model’s performance by saving and exploring detailed, +sample-by-sample results to debug and see how your models stack-up. 
+ +Customization at your fingertips: letting you effortlessly create [new +tasks](adding-a-custom-task) and +[metrics](adding-a-new-metric) +tailored to your needs, or browsing all our existing tasks and metrics. + +Seamlessly experiment, benchmark, and store your results on the Hugging Face +Hub, S3, or locally. diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx new file mode 100644 index 000000000..39ac2b897 --- /dev/null +++ b/docs/source/installation.mdx @@ -0,0 +1,46 @@ +# Installation + +You can install Lighteval either from PyPi or from source. + +## From PyPi + +```bash +pip install lighteval +``` + +## From source +Source install is mostly for people who intend to develop on `lighteval` + +```bash +git clone https://github.com/huggingface/lighteval.git +cd lighteval +pip install -e . +``` + +## Extras + +Lighteval has optional dependencies that you can install by specifying the +appropriate extras group. +`pip install lighteval[]` or `pip install -e .[]`. + +| extra name | description | +|--------------|---------------------------------------------------------------------------| +| accelerate | To use accelerate for model and data parallelism with transformers models | +| tgi | To use Text Generation Inference API to evaluate your model | +| nanotron | To evaluate nanotron models | +| quantization | To evaluate quantized models | +| adapters | To evaluate adapters models (delta and peft) | +| tensorboardX | To upload your results to tensorboard | +| vllm | To use vllm as backend for inference | +| s3 | To upload results to s3 | + + +## Hugging Face login + +If you want to push your results to the Hugging Face Hub or evaluate your own +private models, don't forget to add your access token to the environment +variable `HF_TOKEN`. You can do this by running: + +```bash +huggingface-cli login +``` diff --git a/docs/source/metric-list.mdx b/docs/source/metric-list.mdx new file mode 100644 index 000000000..0ab03afb9 --- /dev/null +++ b/docs/source/metric-list.mdx @@ -0,0 +1,76 @@ +# Metric List + +## Automatic metrics for multiple choice tasks + +These metrics use log-likelihood of the different possible targets. 
+- `loglikelihood_acc`: Fraction of instances where the choice with the best logprob was correct - also exists in a faster version for tasks where the possible choices include only one token (`loglikelihood_acc_single_token`)
+- `loglikelihood_acc_norm`: Fraction of instances where the choice with the best logprob, normalized by sequence length, was correct - also exists in a faster version for tasks where the possible choices include only one token (`loglikelihood_acc_norm_single_token`)
+- `loglikelihood_acc_norm_nospace`: Fraction of instances where the choice with the best logprob, normalized by sequence length, was correct, with the first space ignored
+- `loglikelihood_f1`: Corpus level F1 score of the multichoice selection - also exists in a faster version for tasks where the possible choices include only one token (`loglikelihood_f1_single_token`)
+- `mcc`: Matthews correlation coefficient (a measure of agreement between statistical distributions) - also exists in a faster version for tasks where the possible choices include only one token (`mcc_single_token`)
+- `recall_at_1`: Fraction of instances where the choice with the best logprob was correct - also exists in a faster version for tasks where the possible choices include only one token per choice (`recall_at_1_single_token`)
+- `recall_at_2`: Fraction of instances where the choice with the 2nd best logprob or better was correct - also exists in a faster version for tasks where the possible choices include only one token per choice (`recall_at_2_single_token`)
+- `mrr`: Mean reciprocal rank, a measure of the quality of a ranking of choices ordered by correctness/relevance - also exists in a faster version for tasks where the possible choices include only one token (`mrr_single_token`)
+- `target_perplexity`: Perplexity of the different choices available.
+- `acc_golds_likelihood`: A bit different from the others: checks whether the average logprob of a single target is above or below 0.5
+- `multi_f1_numeric`: Loglikelihood F1 score for multiple gold targets
+
+Most of these metrics also exist in a "single token" version (`loglikelihood_acc_single_token`, `loglikelihood_acc_norm_single_token`, `loglikelihood_f1_single_token`, `mcc_single_token`, `recall_at_1_single_token`, `recall_at_2_single_token` and `mrr_single_token`). When the multichoice options each cover only one token (e.g. "A" vs "B" vs "C" vs "D", or "yes" vs "no"), using the single token version of these metrics divides the time spent by the number of choices. Single token evals also include:
+- `multi_f1_numeric`: Computes the F1 score of all possible choices and averages it.
+
+## Automatic metrics for perplexity and language modeling
+These metrics use the log-likelihood of the prompt.
+- `word_perplexity`: Perplexity (log probability of the input) weighted by the number of words in the sequence.
+- `byte_perplexity`: Perplexity (log probability of the input) weighted by the number of bytes in the sequence.
+- `bits_per_byte`: Average number of bits per byte according to model probabilities.
+- `log_prob`: Predicted output's average log probability (input's log prob for language modeling).
+
+## Automatic metrics for generative tasks
+These metrics need the model to generate an output. They are therefore slower.
+- Base:
+  - `perfect_exact_match`: Fraction of instances where the prediction matches the gold exactly.
+  - `exact_match`: Fraction of instances where the prediction matches the gold, except for leading and trailing whitespace (= after a `strip` has been applied to both).
+  - `quasi_exact_match`: Fraction of instances where the normalized prediction matches the normalized gold (normalization done on whitespace, articles, capitalization, ...). Other variations exist, with other normalizers, such as `quasi_exact_match_triviaqa`, which only normalizes the predictions after applying a strip to all sentences.
+  - `prefix_exact_match`: Fraction of instances where the beginning of the prediction matches the gold, except for leading and trailing whitespace (= after a `strip` has been applied to both).
+  - `prefix_quasi_exact_match`: Fraction of instances where the normalized beginning of the prediction matches the normalized gold (normalization done on whitespace, articles, capitalization, ...).
+  - `exact_match_indicator`: Exact match with some preceding context (before an indicator) removed.
+  - `f1_score_quasi`: Average F1 score in terms of word overlap between the model output and gold, with both being normalized first.
+  - `f1_score`: Average F1 score in terms of word overlap between the model output and gold without normalisation.
+  - `f1_score_macro`: Corpus level macro F1 score.
+  - `f1_score_micro`: Corpus level micro F1 score.
+  - `maj_at_5` and `maj_at_8`: Model majority vote. Takes n (5 or 8) generations from the model and assumes the most frequent is the actual prediction.
+- Summarization:
+  - `rouge`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/).
+  - `rouge1`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
+  - `rouge2`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
+  - `rougeL`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
+  - `rougeLsum`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
+  - `rouge_t5` (BigBench): Corpus level ROUGE score for all available ROUGE metrics.
+  - `faithfulness`: Faithfulness scores based on the SummaC method of [Laban et al. (2022)](https://aclanthology.org/2022.tacl-1.10/).
+  - `extractiveness`: Reports the following, based on [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/):
+    - `summarization_coverage`: Extent to which the model-generated summaries are extractive fragments from the source document,
+    - `summarization_density`: Extent to which the model-generated summaries are extractive summaries based on the source document,
+    - `summarization_compression`: Extent to which the model-generated summaries are compressed relative to the source document.
+  - `bert_score`: Reports the average BERTScore precision, recall, and f1 score [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and gold summary.
+- Translation:
+  - `bleu`: Corpus level BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) - uses the sacrebleu implementation.
+  - `bleu_1`: Average sample BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 1-gram overlap - uses the nltk implementation.
+  - `bleu_4`: Average sample BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap - uses the nltk implementation.
+  - `chrf`: Character n-gram matches f-score.
+  - `ter`: Translation edit/error rate.
+- Copyright:
+  - `copyright`: Reports:
+    - `longest_common_prefix_length`: average length of longest common prefix between model generation and reference,
+    - `edit_distance`: average Levenshtein edit distance between model generation and reference,
+    - `edit_similarity`: average Levenshtein edit similarity (normalized by length of longer sequence) between model generation and reference.
+- Math:
+  - `quasi_exact_match_math`: Fraction of instances where the normalized prediction matches the normalized gold (normalization done for math, where latex symbols, units, etc. are removed)
+  - `maj_at_4_math`: Majority choice evaluation, using the math normalisation for the predictions and gold
+  - `quasi_exact_match_gsm8k`: Fraction of instances where the normalized prediction matches the normalized gold (normalization done for gsm8k, where latex symbols, units, etc. are removed)
+  - `maj_at_8_gsm8k`: Majority choice evaluation, using the gsm8k normalisation for the predictions and gold
+
+## LLM-as-Judge
+- `llm_judge_gpt3p5`: Can be used for any generative task; the model is scored by a GPT-3.5 model through the OpenAI API.
+- `llm_judge_llama_3_405b`: Can be used for any generative task; the model is scored by a Llama 3 405B model through the HuggingFace API.
+- `llm_judge_multi_turn_gpt3p5`: Can be used for any generative task; the model is scored by a GPT-3.5 model through the OpenAI API. It is used for multi-turn tasks like mt-bench.
+- `llm_judge_multi_turn_llama_3_405b`: Can be used for any generative task; the model is scored by a Llama 3 405B model through the HuggingFace API. It is used for multi-turn tasks like mt-bench.
diff --git a/docs/source/quicktour.mdx b/docs/source/quicktour.mdx
new file mode 100644
index 000000000..5f66547e9
--- /dev/null
+++ b/docs/source/quicktour.mdx
@@ -0,0 +1,160 @@
+# Quicktour
+
+We provide two main entry points to evaluate models:
+
+- `lighteval accelerate`: evaluate models on CPU or one or more GPUs using [🤗
+  Accelerate](https://github.com/huggingface/accelerate)
+- `lighteval nanotron`: evaluate models in distributed settings using [⚡️
+  Nanotron](https://github.com/huggingface/nanotron)
+
+## Accelerate
+
+### Evaluate a model on a GPU
+
+To evaluate `GPT-2` on the Truthful QA benchmark, run:
+
+```bash
+lighteval accelerate \
+    --model_args "pretrained=gpt2" \
+    --tasks "leaderboard|truthfulqa:mc|0|0" \
+    --override_batch_size 1 \
+    --output_dir="./evals/"
+```
+
+Here, `--tasks` refers to either a comma-separated list of supported tasks from
+the [tasks_list](available-tasks) in the format:
+
+```bash
+{suite}|{task}|{num_few_shot}|{0 or 1 to automatically reduce `num_few_shot` if prompt is too long}
+```
+
+or a file path like
+[examples/tasks/recommended_set.txt](https://github.com/huggingface/lighteval/blob/main/examples/tasks/recommended_set.txt)
+which specifies multiple task configurations.
+
+Task details can be found in the
+[file](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/default_tasks.py)
+implementing them.
+
+### Evaluate a model on one or more GPUs
+
+#### Data parallelism
+
+To evaluate a model on one or more GPUs, first create a multi-gpu config by running:
+ +```bash +accelerate config +``` + +You can then evaluate a model using data parallelism on 8 GPUs like follows: + +```bash +accelerate launch --multi_gpu --num_processes=8 -m \ + lighteval accelerate \ + --model_args "pretrained=gpt2" \ + --tasks "leaderboard|truthfulqa:mc|0|0" \ + --override_batch_size 1 \ + --output_dir="./evals/" +``` + +Here, `--override_batch_size` defines the batch size per device, so the effective +batch size will be `override_batch_size * num_gpus`. + +#### Pipeline parallelism + +To evaluate a model using pipeline parallelism on 2 or more GPUs, run: + +```bash +lighteval accelerate \ + --model_args "pretrained=gpt2,model_parallel=True" \ + --tasks "leaderboard|truthfulqa:mc|0|0" \ + --override_batch_size 1 \ + --output_dir="./evals/" +``` + +This will automatically use accelerate to distribute the model across the GPUs. + +> [!TIP] +> Both data and pipeline parallelism can be combined by setting +> `model_parallel=True` and using accelerate to distribute the data across the +GPUs. + +### Model Arguments + +The `--model_args` argument takes a string representing a list of model +argument. The arguments allowed vary depending on the backend you use (vllm or +accelerate). + +#### Accelerate + +- **pretrained** (str): + HuggingFace Hub model ID name or the path to a pre-trained + model to load. This is effectively the `pretrained_model_name_or_path` + argument of `from_pretrained` in the HuggingFace `transformers` API. +- **tokenizer** (Optional[str]): HuggingFace Hub tokenizer ID that will be + used for tokenization. +- **multichoice_continuations_start_space** (Optional[bool]): Whether to add a + space at the start of each continuation in multichoice generation. + For example, context: "What is the capital of France?" and choices: "Paris", "London". + Will be tokenized as: "What is the capital of France? Paris" and "What is the capital of France? London". + True adds a space, False strips a space, None does nothing +- **subfolder** (Optional[str]): The subfolder within the model repository. +- **revision** (str): The revision of the model. +- **max_gen_toks** (Optional[int]): The maximum number of tokens to generate. +- **max_length** (Optional[int]): The maximum length of the generated output. +- **add_special_tokens** (bool, optional, defaults to True): Whether to add special tokens to the input sequences. + If `None`, the default value will be set to `True` for seq2seq models (e.g. T5) and + `False` for causal models. +- **model_parallel** (bool, optional, defaults to False): + True/False: force to use or not the `accelerate` library to load a large + model across multiple devices. + Default: None which corresponds to comparing the number of processes with + the number of GPUs. If it's smaller => model-parallelism, else not. +- **dtype** (Union[str, torch.dtype], optional, defaults to None):): + Converts the model weights to `dtype`, if specified. Strings get + converted to `torch.dtype` objects (e.g. `float16` -> `torch.float16`). + Use `dtype="auto"` to derive the type from the model's weights. +- **device** (Union[int, str]): device to use for model training. +- **quantization_config** (Optional[BitsAndBytesConfig]): quantization + configuration for the model, manually provided to load a normally floating point + model at a quantized precision. Needed for 4-bit and 8-bit precision. +- **trust_remote_code** (bool): Whether to trust remote code during model + loading. 
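+
+As a rough illustration, all of these options end up in a single
+comma-separated `key=value` string passed to `--model_args`; a small helper
+like the one below (purely illustrative, with placeholder values) shows how
+such a string can be composed programmatically:
+
+```python
+# Illustrative only: --model_args expects a comma-separated "key=value" string.
+model_args = {
+    "pretrained": "gpt2",
+    "revision": "main",
+    "dtype": "float16",
+    "trust_remote_code": True,
+}
+model_args_str = ",".join(f"{key}={value}" for key, value in model_args.items())
+print(model_args_str)  # pretrained=gpt2,revision=main,dtype=float16,trust_remote_code=True
+```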
+
+
+#### VLLM
+
+- **pretrained** (str): HuggingFace Hub model ID name or the path to a pre-trained model to load.
+- **gpu_memory_utilisation** (float): The fraction of GPU memory to use.
+- **batch_size** (int): The batch size to use for the evaluation.
+- **revision** (str): The revision of the model.
+- **dtype** (str, None): The data type to use for the model.
+- **tensor_parallel_size** (int): The number of tensor parallel units to use.
+- **data_parallel_size** (int): The number of data parallel units to use.
+- **max_model_length** (int): The maximum length of the model.
+- **swap_space** (int): The CPU swap space size (GiB) per GPU.
+- **seed** (int): The seed to use for the model.
+- **trust_remote_code** (bool): Whether to trust remote code during model loading.
+- **use_chat_template** (bool): Whether to use the chat template or not.
+- **add_special_tokens** (bool): Whether to add special tokens to the input sequences.
+- **multichoice_continuations_start_space** (bool): Whether to add a space at the start of each continuation in multichoice generation.
+- **subfolder** (Optional[str]): The subfolder within the model repository.
+
+## Nanotron
+
+To evaluate a model trained with nanotron on a single GPU, run the command below.
+
+> [!WARNING]
+> Nanotron models cannot be evaluated without torchrun.
+
+
+```bash
+ torchrun --standalone --nnodes=1 --nproc-per-node=1 \
+ src/lighteval/__main__.py nanotron \
+ --checkpoint_config_path ../nanotron/checkpoints/10/config.yaml \
+ --lighteval_config_path examples/nanotron/lighteval_config_override_template.yaml
+ ```
+
+The `nproc-per-node` argument should match the data, tensor and pipeline
+parallelism configured in the `lighteval_config_override_template.yaml` file.
+That is: `nproc-per-node = data_parallelism * tensor_parallelism *
+pipeline_parallelism`.
diff --git a/docs/source/saving-and-reading-results.mdx b/docs/source/saving-and-reading-results.mdx
new file mode 100644
index 000000000..b50cdee6c
--- /dev/null
+++ b/docs/source/saving-and-reading-results.mdx
@@ -0,0 +1,214 @@
+# Saving and reading results
+
+## Saving results locally
+
+Lighteval will automatically save results and evaluation details in the
+directory set with the `--output_dir` argument. The results will be saved in
+`{output_dir}/results/{model_name}/results_{timestamp}.json`. [Here is an
+example of a result file](#example-of-a-result-file). The output path can be
+any [fsspec](https://filesystem-spec.readthedocs.io/en/latest/index.html)
+compliant path (local, s3, hf hub, gdrive, ftp, etc.).
+
+To save the details of the evaluation, you can use the `--save_details`
+argument. The details will be saved in a parquet file
+`{output_dir}/details/{model_name}/{timestamp}/details_{task}_{timestamp}.parquet`.
+
+## Pushing results to the HuggingFace hub
+
+You can push the results and evaluation details to the HuggingFace hub. To do
+so, you need to set the `--push_to_hub` as well as the `--results_org`
+argument. The results will be saved in a dataset under
+`{results_org}/{model_org}/{model_name}`. To push the details, you need to set
+the `--save_details` argument.
+The dataset created will be private by default; you can make it public by
+setting the `--public_run` argument.
+
+
+## Pushing results to Tensorboard
+
+You can push the results to Tensorboard by setting `--push_to_tensorboard`.
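+
+For a quick look at what was produced, the latest local results file can also be
+read back with plain Python (a minimal sketch; `output_dir` and `model_name`
+below are placeholders that should match the path pattern described above):
+
+```python
+import glob
+import json
+
+output_dir = "evals_doc"  # the value passed to --output_dir
+model_name = "HuggingFaceH4/zephyr-7b-beta"
+
+# Results are stored as {output_dir}/results/{model_name}/results_{timestamp}.json
+result_files = sorted(glob.glob(f"{output_dir}/results/{model_name}/results_*.json"))
+with open(result_files[-1]) as f:  # most recent timestamp
+    results = json.load(f)
+
+print(results["results"])  # aggregated metrics per task
+```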
+
+
+## How to load and investigate details
+
+### Load from local detail files
+
+```python
+import glob
+
+from datasets import load_dataset
+
+output_dir = "evals_doc"
+model_name = "HuggingFaceH4/zephyr-7b-beta"
+timestamp = "latest"
+task = "lighteval|gsm8k|0"
+
+if timestamp == "latest":
+    path = f"{output_dir}/details/{model_name}/*/"
+    timestamps = glob.glob(path)
+    timestamp = sorted(timestamps)[-1].split("/")[-2]
+    print(f"Latest timestamp: {timestamp}")
+
+details_path = f"{output_dir}/details/{model_name}/{timestamp}/details_{task}_{timestamp}.parquet"
+
+# Load the details
+details = load_dataset("parquet", data_files=details_path, split="train")
+
+for detail in details:
+    print(detail)
+```
+
+### Load from the HuggingFace hub
+
+```python
+from datasets import load_dataset
+
+results_org = "SaylorTwift"
+model_name = "HuggingFaceH4/zephyr-7b-beta"
+sanitized_model_name = model_name.replace("/", "__")
+task = "lighteval|gsm8k|0"
+public_run = False
+
+dataset_path = f"{results_org}/details_{sanitized_model_name}{'_private' if not public_run else ''}"
+details = load_dataset(dataset_path, task.replace("|", "_"), split="latest")
+
+for detail in details:
+    print(detail)
+```
+
+
+The detail file contains the following columns:
+- `choices`: The choices presented to the model in the case of multichoice tasks.
+- `gold`: The gold answer.
+- `gold_index`: The index of the gold answer in the choices list.
+- `cont_tokens`: The continuation tokens.
+- `example`: The input in text form.
+- `full_prompt`: The full prompt that is fed to the model.
+- `input_tokens`: The tokens of the full prompt.
+- `instruction`: The instruction given to the model.
+- `metrics`: The metrics computed for the example.
+- `num_asked_few_shots`: The number of few shots asked to the model.
+- `num_effective_few_shots`: The number of effective few shots.
+- `padded`: Whether the input was padded.
+- `pred_logits`: The logits of the model.
+- `predictions`: The predictions of the model.
+- `specifics`: The specifics of the task.
+- `truncated`: Whether the input was truncated.
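+
+Once loaded with either snippet above, the details can also be turned into a
+pandas DataFrame to inspect these columns side by side (a small sketch that
+continues the examples above and assumes pandas is installed):
+
+```python
+import pandas as pd
+
+# `details` is the datasets.Dataset loaded in one of the snippets above
+df = details.to_pandas()
+
+# Compare what the model was given with what it predicted
+print(df[["full_prompt", "predictions", "gold", "metrics"]].head())
+```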
+ + +## Example of a result file + +```json +{ + "config_general": { + "lighteval_sha": "203045a8431bc9b77245c9998e05fc54509ea07f", + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": 1, + "job_id": "", + "start_time": 620979.879320166, + "end_time": 621004.632108041, + "total_evaluation_time_secondes": "24.752787875011563", + "model_name": "gpt2", + "model_sha": "607a30d783dfa663caf39e06633721c8d4cfcd7e", + "model_dtype": null, + "model_size": "476.2 MB" + }, + "results": { + "lighteval|gsm8k|0": { + "qem": 0.0, + "qem_stderr": 0.0, + "maj@8": 0.0, + "maj@8_stderr": 0.0 + }, + "all": { + "qem": 0.0, + "qem_stderr": 0.0, + "maj@8": 0.0, + "maj@8_stderr": 0.0 + } + }, + "versions": { + "lighteval|gsm8k|0": 0 + }, + "config_tasks": { + "lighteval|gsm8k": { + "name": "gsm8k", + "prompt_function": "gsm8k", + "hf_repo": "gsm8k", + "hf_subset": "main", + "metric": [ + { + "metric_name": "qem", + "higher_is_better": true, + "category": "3", + "use_case": "5", + "sample_level_fn": "compute", + "corpus_level_fn": "mean" + }, + { + "metric_name": "maj@8", + "higher_is_better": true, + "category": "5", + "use_case": "5", + "sample_level_fn": "compute", + "corpus_level_fn": "mean" + } + ], + "hf_avail_splits": [ + "train", + "test" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": null, + "few_shots_select": "random_sampling_from_train", + "generation_size": 256, + "generation_grammar": null, + "stop_sequence": [ + "Question=" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "lighteval" + ], + "original_num_docs": 1319, + "effective_num_docs": 1, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + } + }, + "summary_tasks": { + "lighteval|gsm8k|0": { + "hashes": { + "hash_examples": "8517d5bf7e880086", + "hash_full_prompts": "8517d5bf7e880086", + "hash_input_tokens": "29916e7afe5cb51d", + "hash_cont_tokens": "37f91ce23ef6d435" + }, + "truncated": 2, + "non_truncated": 0, + "padded": 0, + "non_padded": 2, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "5f383c395f01096e", + "hash_full_prompts": "5f383c395f01096e", + "hash_input_tokens": "ac933feb14f96d7b", + "hash_cont_tokens": "9d03fb26f8da7277" + }, + "truncated": 2, + "non_truncated": 0, + "padded": 0, + "non_padded": 2, + "num_truncated_few_shots": 0 + } +} +``` diff --git a/docs/source/use-vllm-as-backend.mdx b/docs/source/use-vllm-as-backend.mdx new file mode 100644 index 000000000..153ff659f --- /dev/null +++ b/docs/source/use-vllm-as-backend.mdx @@ -0,0 +1,53 @@ +# Use VLLM as backend + +Lighteval allows you to use `vllm` as backend allowing great speedups. +To use, simply change the `model_args` to reflect the arguments you want to pass to vllm. + +```bash +lighteval accelerate \ + --model_args="vllm,pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16" \ + --tasks "leaderboard|truthfulqa:mc|0|0" \ + --output_dir="./evals/" +``` + +`vllm` is able to distribute the model across multiple GPUs using data +parallelism, pipeline parallelism or tensor parallelism. +You can choose the parallelism method by setting in the the `model_args`. 
+ +For example if you have 4 GPUs you can split it across using `tensor_parallelism`: + +```bash +export VLLM_WORKER_MULTIPROC_METHOD=spawn && lighteval accelerate \ + --model_args="vllm,pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16,tensor_parallel_size=4" \ + --tasks "leaderboard|truthfulqa:mc|0|0" \ + --output_dir="./evals/" +``` + +Or, if your model fits on a single GPU, you can use `data_parallelism` to speed up the evaluation: + +```bash +lighteval accelerate \ + --model_args="vllm,pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16,data_parallel_size=4" \ + --tasks "leaderboard|truthfulqa:mc|0|0" \ + --output_dir="./evals/" +``` + +Available arguments for `vllm` can be found in the `VLLMModelConfig`: + +- **pretrained** (str): HuggingFace Hub model ID name or the path to a pre-trained model to load. +- **gpu_memory_utilisation** (float): The fraction of GPU memory to use. +- **revision** (str): The revision of the model. +- **dtype** (str, None): The data type to use for the model. +- **tensor_parallel_size** (int): The number of tensor parallel units to use. +- **data_parallel_size** (int): The number of data parallel units to use. +- **max_model_length** (int): The maximum length of the model. +- **swap_space** (int): The CPU swap space size (GiB) per GPU. +- **seed** (int): The seed to use for the model. +- **trust_remote_code** (bool): Whether to trust remote code during model loading. +- **add_special_tokens** (bool): Whether to add special tokens to the input sequences. +- **multichoice_continuations_start_space** (bool): Whether to add a space at the start of each continuation in multichoice generation. + +> [!WARNING] +> In the case of OOM issues, you might need to reduce the context size of the +> model as well as reduce the `gpu_memory_utilisation` parameter. + diff --git a/docs/source/using-the-python-api.mdx b/docs/source/using-the-python-api.mdx new file mode 100644 index 000000000..82238c7f1 --- /dev/null +++ b/docs/source/using-the-python-api.mdx @@ -0,0 +1,62 @@ +# Using the Python API + +Lighteval can be used from a custom python script. To evaluate a model you will +need to setup an `evaluation_tracker`, `pipeline_parameters`, `model_config` +and a `pipeline`. + +After that, simply run the pipeline and save the results. 
+
+
+```python
+from datetime import timedelta
+
+import lighteval
+from lighteval.logging.evaluation_tracker import EvaluationTracker
+from lighteval.models.model_config import VLLMModelConfig
+from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters
+from lighteval.utils.utils import EnvConfig
+from lighteval.utils.imports import is_accelerate_available
+
+if is_accelerate_available():
+    from accelerate import Accelerator, InitProcessGroupKwargs
+    accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=3000))])
+else:
+    accelerator = None
+
+def main():
+    evaluation_tracker = EvaluationTracker(
+        output_dir="./results",
+        save_details=True,
+        push_to_hub=True,
+        hub_results_org="your user name",
+    )
+
+    pipeline_params = PipelineParameters(
+        launcher_type=ParallelismManager.ACCELERATE,
+        env_config=EnvConfig(cache_dir="tmp/"),
+        # Remove the 2 parameters below once your configuration is tested
+        override_batch_size=1,
+        max_samples=10
+    )
+
+    model_config = VLLMModelConfig(
+        pretrained="HuggingFaceH4/zephyr-7b-beta",
+        dtype="float16",
+        use_chat_template=True,
+    )
+
+    task = "helm|mmlu|5|1"
+
+    pipeline = Pipeline(
+        tasks=task,
+        pipeline_parameters=pipeline_params,
+        evaluation_tracker=evaluation_tracker,
+        model_config=model_config,
+        custom_task_directory=None, # if using a custom task
+    )
+
+    pipeline.evaluate()
+    pipeline.save_and_push_results()
+    pipeline.show_results()
+
+if __name__ == "__main__":
+    main()
+```
diff --git a/pyproject.toml b/pyproject.toml
index e736be66a..a779ebf4c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -92,6 +92,7 @@ vllm = ["vllm", "ray", "more_itertools"]
 quality = ["ruff==v0.2.2","pre-commit"]
 tests = ["pytest==7.4.0"]
 dev = ["lighteval[accelerate,quality,tests,multilingual]"]
+docs = ["hf-doc-builder", "watchdog"]
 extended_tasks = [
     "langdetect", # ifeval
     "openai", # llm as a judge using openai models