From 0c808016f2937360e5c4e11f38e4c0372e704861 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 28 Nov 2024 20:29:54 +0100
Subject: [PATCH] Set up docs (#403)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* Add docs
* Add wiki to docs
* Adapt wiki as docs
* Force docs build
* Fix link in _toctree
* Add titles to docs pages
* Update docs/source/evaluate-the-model-on-a-server-or-container.mdx
Co-authored-by: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
---------
Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>
Co-authored-by: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
---
.github/workflows/doc-build.yml | 18 +
.github/workflows/doc-pr-build.yml | 16 +
docs/source/_toctree.yml | 30 +
docs/source/adding-a-custom-task.mdx | 196 +++
docs/source/adding-a-new-metric.mdx | 93 ++
docs/source/available-tasks.mdx | 1250 +++++++++++++++++
...ntributing-to-multilingual-evaluations.mdx | 107 ++
...ate-the-model-on-a-server-or-container.mdx | 67 +
docs/source/index.mdx | 18 +
docs/source/installation.mdx | 46 +
docs/source/metric-list.mdx | 76 +
docs/source/quicktour.mdx | 160 +++
docs/source/saving-and-reading-results.mdx | 214 +++
docs/source/use-vllm-as-backend.mdx | 53 +
docs/source/using-the-python-api.mdx | 62 +
pyproject.toml | 1 +
16 files changed, 2407 insertions(+)
create mode 100644 .github/workflows/doc-build.yml
create mode 100644 .github/workflows/doc-pr-build.yml
create mode 100644 docs/source/_toctree.yml
create mode 100644 docs/source/adding-a-custom-task.mdx
create mode 100644 docs/source/adding-a-new-metric.mdx
create mode 100644 docs/source/available-tasks.mdx
create mode 100644 docs/source/contributing-to-multilingual-evaluations.mdx
create mode 100644 docs/source/evaluate-the-model-on-a-server-or-container.mdx
create mode 100644 docs/source/index.mdx
create mode 100644 docs/source/installation.mdx
create mode 100644 docs/source/metric-list.mdx
create mode 100644 docs/source/quicktour.mdx
create mode 100644 docs/source/saving-and-reading-results.mdx
create mode 100644 docs/source/use-vllm-as-backend.mdx
create mode 100644 docs/source/using-the-python-api.mdx
diff --git a/.github/workflows/doc-build.yml b/.github/workflows/doc-build.yml
new file mode 100644
index 000000000..cd345d3d3
--- /dev/null
+++ b/.github/workflows/doc-build.yml
@@ -0,0 +1,18 @@
+name: Build Documentation
+
+on:
+ push:
+ branches:
+ - main
+ - doc-builder*
+ - v*-release
+
+jobs:
+ build:
+ uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
+ with:
+ commit_sha: ${{ github.sha }}
+ package: lighteval
+ secrets:
+ token: ${{ secrets.HUGGINGFACE_PUSH }}
+ hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
diff --git a/.github/workflows/doc-pr-build.yml b/.github/workflows/doc-pr-build.yml
new file mode 100644
index 000000000..f96e20583
--- /dev/null
+++ b/.github/workflows/doc-pr-build.yml
@@ -0,0 +1,16 @@
+name: Build PR Documentation
+
+on:
+ pull_request:
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+ cancel-in-progress: true
+
+jobs:
+ build:
+ uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
+ with:
+ commit_sha: ${{ github.event.pull_request.head.sha }}
+ pr_number: ${{ github.event.number }}
+ package: lighteval
diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
new file mode 100644
index 000000000..243462b3d
--- /dev/null
+++ b/docs/source/_toctree.yml
@@ -0,0 +1,30 @@
+- sections:
+ - local: index
+ title: 🤗 Lighteval
+ - local: installation
+ title: Installation
+ - local: quicktour
+ title: Quicktour
+ title: Getting started
+- sections:
+ - local: saving-and-reading-results
+ title: Save and read results
+ - local: using-the-python-api
+ title: Use the Python API
+ - local: adding-a-custom-task
+ title: Add a custom task
+ - local: adding-a-new-metric
+ title: Add a custom metric
+ - local: use-vllm-as-backend
+ title: Use VLLM as backend
+ - local: evaluate-the-model-on-a-server-or-container
+ title: Evaluate on Server
+ - local: contributing-to-multilingual-evaluations
+ title: Contributing to multilingual evaluations
+ title: Guides
+- sections:
+ - local: metric-list
+ title: Available Metrics
+ - local: available-tasks
+ title: Available Tasks
+ title: API
diff --git a/docs/source/adding-a-custom-task.mdx b/docs/source/adding-a-custom-task.mdx
new file mode 100644
index 000000000..bcaa932ff
--- /dev/null
+++ b/docs/source/adding-a-custom-task.mdx
@@ -0,0 +1,196 @@
+# Adding a Custom Task
+
+To add a new task, first open an issue to determine whether it should be
+integrated in the core evaluations of lighteval, in the extended tasks, or in
+the community tasks, and add its dataset on the Hub.
+
+- Core evaluations only require standard logic in their metrics and
+  processing, and we add them to our test suite to ensure non-regression over
+  time. They already see high usage in the community.
+- Extended evaluations require custom logic in their metrics (complex
+  normalisation, an LLM as a judge, ...); we added them to make users' lives
+  easier. They already see high usage in the community.
+- Community evaluations are new tasks submitted by the community.
+
+A popular community evaluation can move to become an extended or core evaluation over time.
+
+> [!TIP]
+> You can find examples of custom tasks in the `community_tasks` directory.
+
+## Step by step creation of a custom task
+
+> [!WARNING]
+> To contribute your custom task to the lighteval repo, you first need
+> to install the required dev dependencies by running `pip install -e .[dev]`
+> and then run `pre-commit install` to install the pre-commit hooks.
+
+First, create a python file under the `community_tasks` directory.
+
+You need to define a prompt function that will convert a line from your
+dataset to a document to be used for evaluation.
+
+```python
+# Define as many as you need for your different tasks
+def prompt_fn(line, task_name: str = None):
+ """Defines how to go from a dataset line to a doc object.
+ Follow examples in src/lighteval/tasks/default_prompts.py, or get more info
+ about what this function should do in the README.
+ """
+ return Doc(
+ task_name=task_name,
+ query=line["question"],
+ choices=[f" {c}" for c in line["choices"]],
+ gold_index=line["gold"],
+ instruction="",
+ )
+```
+
+Then, you need to choose a metric: you can either use an existing one (defined
+in `lighteval/metrics/metrics.py`) or [create a custom one](adding-a-new-metric).
+
+```python
+custom_metric = SampleLevelMetric(
+ metric_name="my_custom_metric_name",
+ higher_is_better=True,
+ category=MetricCategory.IGNORED,
+ use_case=MetricUseCase.NONE,
+ sample_level_fn=lambda x: x, # how to compute score for one sample
+    corpus_level_fn=np.mean, # How to aggregate the sample-level metrics
+)
+```
+
+Then, you need to define your task. You can define a task with or without subsets.
+To define a task with no subsets:
+
+```python
+# This is how you create a simple task (like hellaswag) which has one single subset
+# attached to it, and one evaluation possible.
+task = LightevalTaskConfig(
+ name="myothertask",
+    prompt_function=prompt_fn, # must be defined in the file or imported from src/lighteval/tasks/default_prompts.py
+ suite=["community"],
+ hf_repo="",
+ hf_subset="default",
+ hf_avail_splits=[],
+ evaluation_splits=[],
+ few_shots_split=None,
+ few_shots_select=None,
+ metric=[], # select your metric in Metrics
+)
+```
+
+If you want to create a task with multiple subsets, add them to the
+`SAMPLE_SUBSETS` list and create a task for each subset.
+
+```python
+SAMPLE_SUBSETS = [] # list of all the subsets to use for this eval
+
+
+class CustomSubsetTask(LightevalTaskConfig):
+ def __init__(
+ self,
+ name,
+ hf_subset,
+ ):
+ super().__init__(
+ name=name,
+ hf_subset=hf_subset,
+            prompt_function=prompt_fn, # must be defined in the file or imported from src/lighteval/tasks/default_prompts.py
+ hf_repo="",
+ metric=[custom_metric], # select your metric in Metrics or use your custom_metric
+ hf_avail_splits=[],
+ evaluation_splits=[],
+ few_shots_split=None,
+ few_shots_select=None,
+ suite=["community"],
+ generation_size=-1,
+ stop_sequence=None,
+ output_regex=None,
+ frozen=False,
+ )
+
+
+SUBSET_TASKS = [CustomSubsetTask(name=f"mytask:{subset}", hf_subset=subset) for subset in SAMPLE_SUBSETS]
+```
+
+Here is a list of the parameters and their meaning (a filled-in example follows the list):
+
+- `name` (str), your evaluation name
+- `suite` (list), the suite(s) to which your evaluation should belong. This
+ field allows us to compare different task implementations and is used as a
+ task selection to differentiate the versions to launch. At the moment, you'll
+ find the keywords ["helm", "bigbench", "original", "lighteval", "community",
+ "custom"]; for core evals, please choose `lighteval`.
+- `prompt_function` (Callable), the prompt function you defined in the step
+ above
+- `hf_repo` (str), the path to your evaluation dataset on the hub
+- `hf_subset` (str), the specific subset you want to use for your evaluation
+ (note: when the dataset has no subset, fill this field with `"default"`, not
+ with `None` or `""`)
+- `hf_avail_splits` (list), all the splits available for your dataset (train,
+ valid or validation, test, other...)
+- `evaluation_splits` (list), the splits you want to use for evaluation
+- `few_shots_split` (str, can be `null`), the specific split from which you
+ want to select samples for your few-shot examples. It should be different
+ from the sets included in `evaluation_splits`
+- `few_shots_select` (str, can be `null`), the method that you will use to
+ select items for your few-shot examples. Can be `null`, or one of:
+ - `balanced` select examples from the `few_shots_split` with balanced
+ labels, to avoid skewing the few shot examples (hence the model
+ generations) toward one specific label
+ - `random` selects examples at random from the `few_shots_split`
+ - `random_sampling` selects new examples at random from the
+ `few_shots_split` for every new item, but if a sampled item is equal to
+ the current one, it is removed from the available samples
+ - `random_sampling_from_train` selects new examples at random from the
+ `few_shots_split` for every new item, but if a sampled item is equal to
+ the current one, it is kept! Only use this if you know what you are
+ doing.
+ - `sequential` selects the first `n` examples of the `few_shots_split`
+- `generation_size` (int), the maximum number of tokens allowed for a
+ generative evaluation. If your evaluation is a log likelihood evaluation
+ (multi-choice), this value should be -1
+- `stop_sequence` (list), a list of strings acting as end of sentence tokens
+ for your generation
+- `metric` (list), the metrics you want to use for your evaluation (see next
+ section for a detailed explanation)
+- `output_regex` (str), A regex string that will be used to filter your
+ generation. (Generative metrics will only select tokens that are between the
+ first and the second sequence matched by the regex. For example, for a regex
+ matching `\n` and a generation `\nModel generation output\nSome other text`
+ the metric will only be fed with `Model generation output`)
+- `frozen` (bool), currently set to False, but we will progressively set all
+ stable tasks to True.
+- `trust_dataset` (bool), set to True if you trust the dataset.
+
+
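+For illustration, here is how these parameters could be combined for a
+hypothetical multiple-choice task (the dataset path and splits below are
+placeholders, not a real configuration):
+
+```python
+my_task = LightevalTaskConfig(
+    name="mycommunitytask",
+    suite=["community"],
+    prompt_function=prompt_fn,  # the prompt function defined above
+    hf_repo="your-username/your-dataset",  # placeholder dataset path on the Hub
+    hf_subset="default",  # use "default" when the dataset has no subset
+    hf_avail_splits=["train", "test"],
+    evaluation_splits=["test"],
+    few_shots_split="train",
+    few_shots_select="balanced",
+    generation_size=-1,  # -1 because this is a log likelihood (multi-choice) evaluation
+    stop_sequence=["\n"],
+    metric=[custom_metric],  # the custom metric defined above, or one from Metrics
+    trust_dataset=True,
+)
+```
+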
+Then you need to add your task to the `TASKS_TABLE` list.
+
+```python
+# STORE YOUR EVALS
+
+# tasks with subset:
+TASKS_TABLE = SUBSET_TASKS
+
+# tasks without subset:
+# TASKS_TABLE = [task]
+```
+
+Finally, add the following module logic at the end of the file; it is only
+used when the file is run directly, and prints the defined tasks as a quick
+sanity check.
+
+```python
+# MODULE LOGIC
+# You should not need to touch this
+# Print the tasks defined in this file when it is run directly (sanity check)
+if __name__ == "__main__":
+    print([t.name for t in TASKS_TABLE])
+ print(len(TASKS_TABLE))
+```
+
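+For example, assuming your file is saved as `community_tasks/my_new_task.py`
+(a placeholder name), you can check that it loads correctly with:
+
+```bash
+python community_tasks/my_new_task.py
+```
+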
+Once your file is created, you can run the evaluation with the following command:
+
+```bash
+lighteval accelerate \
+ --model_args "pretrained=HuggingFaceH4/zephyr-7b-beta" \
+ --tasks "community|{custom_task}|{fewshots}|{truncate_few_shot}" \
+ --custom_tasks {path_to_your_custom_task_file} \
+ --output_dir "./evals"
+```
diff --git a/docs/source/adding-a-new-metric.mdx b/docs/source/adding-a-new-metric.mdx
new file mode 100644
index 000000000..e8562af4f
--- /dev/null
+++ b/docs/source/adding-a-new-metric.mdx
@@ -0,0 +1,93 @@
+# Adding a New Metric
+
+First, check if you can use one of the parametrized functions in
+`src/lighteval/metrics/metrics_corpus.py` or
+`src/lighteval/metrics/metrics_sample.py`.
+
+If not, you can use the `custom_task` system to register your new metric:
+
+> [!TIP]
+> To see an example of a custom metric added along with a custom task, look at the IFEval custom task.
+
+
+> [!WARNING]
+> To contribute your custom metric to the lighteval repo, you would first need
+> to install the required dev dependencies by running `pip install -e .[dev]`
+> and then run `pre-commit install` to install the pre-commit hooks.
+
+
+- Create a new Python file which should contain the full logic of your metric.
+- The file also needs to start with these imports:
+
+```python
+from aenum import extend_enum
+from lighteval.metrics import Metrics
+```
+
+You need to define a sample level metric:
+
+```python
+def custom_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> bool:
+ response = predictions[0]
+ return response == formatted_doc.choices[formatted_doc.gold_index]
+```
+
+Here, the sample-level metric returns only one value per sample. If you want to return multiple metrics per sample, return a dictionary with the metric names as keys and the metric values as values.
+
+```python
+def custom_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> dict:
+ response = predictions[0]
+ return {"accuracy": response == formatted_doc.choices[formatted_doc.gold_index], "other_metric": 0.5}
+```
+
+Then, you can define an aggregation function if needed; a common aggregation function is `np.mean`.
+
+```python
+def agg_function(items):
+ flat_items = [item for sublist in items for item in sublist]
+ score = sum(flat_items) / len(flat_items)
+ return score
+```
+
+Finally, you can define your metric. If it's a sample level metric, you can use the following code:
+
+```python
+my_custom_metric = SampleLevelMetric(
+ metric_name={custom_metric_name},
+ higher_is_better={either True or False},
+ category={MetricCategory},
+ use_case={MetricUseCase},
+ sample_level_fn=custom_metric,
+ corpus_level_fn=agg_function,
+)
+```
+
+If your metric defines multiple metrics per sample, you can use the following code:
+
+```python
+custom_metric = SampleLevelMetricGrouping(
+ metric_name={submetric_names},
+ higher_is_better={n: {True or False} for n in submetric_names},
+ category={MetricCategory},
+ use_case={MetricUseCase},
+ sample_level_fn=custom_metric,
+ corpus_level_fn={
+ "accuracy": np.mean,
+ "other_metric": agg_function,
+ },
+)
+```
+
+To finish, add the following so that your metric is added to our metrics list
+when the file is loaded as a module.
+
+```python
+# Adds the metric to the metric list!
+extend_enum(Metrics, "my_custom_metric", my_custom_metric)
+if __name__ == "__main__":
+ print("Imported metric")
+```
+
+You can then give your custom metric to lighteval by using `--custom-tasks
+path_to_your_file` when launching it.
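+
+For example, reusing the launch command from the [custom task guide](adding-a-custom-task)
+(the model, task specification and file path are placeholders):
+
+```bash
+lighteval accelerate \
+    --model_args "pretrained=HuggingFaceH4/zephyr-7b-beta" \
+    --tasks "community|{custom_task}|{fewshots}|{truncate_few_shot}" \
+    --custom_tasks {path_to_your_custom_task_file} \
+    --output_dir "./evals"
+```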
+
diff --git a/docs/source/available-tasks.mdx b/docs/source/available-tasks.mdx
new file mode 100644
index 000000000..9b167d21e
--- /dev/null
+++ b/docs/source/available-tasks.mdx
@@ -0,0 +1,1250 @@
+# Available Tasks
+
+You can get a list of all the available tasks by running:
+
+```bash
+lighteval tasks --list
+```
+
+## List of tasks
+
+- bigbench:
+ - bigbench|abstract_narrative_understanding
+ - bigbench|anachronisms
+ - bigbench|analogical_similarity
+ - bigbench|analytic_entailment
+ - bigbench|arithmetic_bb
+ - bigbench|ascii_word_recognition
+ - bigbench|authorship_verification
+ - bigbench|auto_categorization
+ - bigbench|auto_debugging
+ - bigbench|bbq_lite_json
+ - bigbench|bridging_anaphora_resolution_barqa
+ - bigbench|causal_judgment
+ - bigbench|cause_and_effect
+ - bigbench|checkmate_in_one
+ - bigbench|chess_state_tracking
+ - bigbench|chinese_remainder_theorem
+ - bigbench|cifar10_classification
+ - bigbench|code_line_description
+ - bigbench|codenames
+ - bigbench|color
+ - bigbench|common_morpheme
+ - bigbench|conceptual_combinations
+ - bigbench|conlang_translation
+ - bigbench|contextual_parametric_knowledge_conflicts
+ - bigbench|coqa_bb
+ - bigbench|crash_blossom
+ - bigbench|crass_ai
+ - bigbench|cryobiology_spanish
+ - bigbench|cryptonite
+ - bigbench|cs_algorithms
+ - bigbench|dark_humor_detection
+ - bigbench|date_understanding
+ - bigbench|disambiguation_qa
+ - bigbench|discourse_marker_prediction
+ - bigbench|disfl_qa
+ - bigbench|dyck_languages
+ - bigbench|elementary_math_qa
+ - bigbench|emoji_movie
+ - bigbench|emojis_emotion_prediction
+ - bigbench|empirical_judgments
+ - bigbench|english_proverbs
+ - bigbench|english_russian_proverbs
+ - bigbench|entailed_polarity
+ - bigbench|entailed_polarity_hindi
+ - bigbench|epistemic_reasoning
+ - bigbench|evaluating_information_essentiality
+ - bigbench|fact_checker
+ - bigbench|fantasy_reasoning
+ - bigbench|few_shot_nlg
+ - bigbench|figure_of_speech_detection
+ - bigbench|formal_fallacies_syllogisms_negation
+ - bigbench|gem
+ - bigbench|gender_inclusive_sentences_german
+ - bigbench|general_knowledge
+ - bigbench|geometric_shapes
+ - bigbench|goal_step_wikihow
+ - bigbench|gre_reading_comprehension
+ - bigbench|hhh_alignment
+ - bigbench|hindi_question_answering
+ - bigbench|hindu_knowledge
+ - bigbench|hinglish_toxicity
+ - bigbench|human_organs_senses
+ - bigbench|hyperbaton
+ - bigbench|identify_math_theorems
+ - bigbench|identify_odd_metaphor
+ - bigbench|implicatures
+ - bigbench|implicit_relations
+ - bigbench|intent_recognition
+ - bigbench|international_phonetic_alphabet_nli
+ - bigbench|international_phonetic_alphabet_transliterate
+ - bigbench|intersect_geometry
+ - bigbench|irony_identification
+ - bigbench|kanji_ascii
+ - bigbench|kannada
+ - bigbench|key_value_maps
+ - bigbench|known_unknowns
+ - bigbench|language_games
+ - bigbench|language_identification
+ - bigbench|linguistic_mappings
+ - bigbench|linguistics_puzzles
+ - bigbench|logic_grid_puzzle
+ - bigbench|logical_args
+ - bigbench|logical_deduction
+ - bigbench|logical_fallacy_detection
+ - bigbench|logical_sequence
+ - bigbench|mathematical_induction
+ - bigbench|matrixshapes
+ - bigbench|metaphor_boolean
+ - bigbench|metaphor_understanding
+ - bigbench|minute_mysteries_qa
+ - bigbench|misconceptions
+ - bigbench|misconceptions_russian
+ - bigbench|mnist_ascii
+ - bigbench|modified_arithmetic
+ - bigbench|moral_permissibility
+ - bigbench|movie_dialog_same_or_different
+ - bigbench|movie_recommendation
+ - bigbench|mult_data_wrangling
+ - bigbench|multiemo
+ - bigbench|natural_instructions
+ - bigbench|navigate
+ - bigbench|nonsense_words_grammar
+ - bigbench|novel_concepts
+ - bigbench|object_counting
+ - bigbench|odd_one_out
+ - bigbench|operators
+ - bigbench|paragraph_segmentation
+ - bigbench|parsinlu_qa
+ - bigbench|parsinlu_reading_comprehension
+ - bigbench|penguins_in_a_table
+ - bigbench|periodic_elements
+ - bigbench|persian_idioms
+ - bigbench|phrase_relatedness
+ - bigbench|physical_intuition
+ - bigbench|physics
+ - bigbench|physics_questions
+ - bigbench|play_dialog_same_or_different
+ - bigbench|polish_sequence_labeling
+ - bigbench|presuppositions_as_nli
+ - bigbench|qa_wikidata
+ - bigbench|question_selection
+ - bigbench|real_or_fake_text
+ - bigbench|reasoning_about_colored_objects
+ - bigbench|repeat_copy_logic
+ - bigbench|rephrase
+ - bigbench|rhyming
+ - bigbench|riddle_sense
+ - bigbench|ruin_names
+ - bigbench|salient_translation_error_detection
+ - bigbench|scientific_press_release
+ - bigbench|semantic_parsing_in_context_sparc
+ - bigbench|semantic_parsing_spider
+ - bigbench|sentence_ambiguity
+ - bigbench|similarities_abstraction
+ - bigbench|simp_turing_concept
+ - bigbench|simple_arithmetic_json
+ - bigbench|simple_arithmetic_json_multiple_choice
+ - bigbench|simple_arithmetic_json_subtasks
+ - bigbench|simple_arithmetic_multiple_targets_json
+ - bigbench|simple_ethical_questions
+ - bigbench|simple_text_editing
+ - bigbench|snarks
+ - bigbench|social_iqa
+ - bigbench|social_support
+ - bigbench|sports_understanding
+ - bigbench|strange_stories
+ - bigbench|strategyqa
+ - bigbench|sufficient_information
+ - bigbench|suicide_risk
+ - bigbench|swahili_english_proverbs
+ - bigbench|swedish_to_german_proverbs
+ - bigbench|symbol_interpretation
+ - bigbench|tellmewhy
+ - bigbench|temporal_sequences
+ - bigbench|tense
+ - bigbench|timedial
+ - bigbench|topical_chat
+ - bigbench|tracking_shuffled_objects
+ - bigbench|understanding_fables
+ - bigbench|undo_permutation
+ - bigbench|unit_conversion
+ - bigbench|unit_interpretation
+ - bigbench|unnatural_in_context_learning
+ - bigbench|vitaminc_fact_verification
+ - bigbench|what_is_the_tao
+ - bigbench|which_wiki_edit
+ - bigbench|wino_x_german
+ - bigbench|winowhy
+ - bigbench|word_sorting
+ - bigbench|word_unscrambling
+
+- harness:
+ - harness|bbh:boolean_expressions
+ - harness|bbh:causal_judgment
+ - harness|bbh:date_understanding
+ - harness|bbh:disambiguation_qa
+ - harness|bbh:dyck_languages
+ - harness|bbh:formal_fallacies
+ - harness|bbh:geometric_shapes
+ - harness|bbh:hyperbaton
+ - harness|bbh:logical_deduction_five_objects
+ - harness|bbh:logical_deduction_seven_objects
+ - harness|bbh:logical_deduction_three_objects
+ - harness|bbh:movie_recommendation
+ - harness|bbh:multistep_arithmetic_two
+ - harness|bbh:navigate
+ - harness|bbh:object_counting
+ - harness|bbh:penguins_in_a_table
+ - harness|bbh:reasoning_about_colored_objects
+ - harness|bbh:ruin_names
+ - harness|bbh:salient_translation_error_detection
+ - harness|bbh:snarks
+ - harness|bbh:sports_understanding
+ - harness|bbh:temporal_sequences
+ - harness|bbh:tracking_shuffled_objects_five_objects
+ - harness|bbh:tracking_shuffled_objects_seven_objects
+ - harness|bbh:tracking_shuffled_objects_three_objects
+ - harness|bbh:web_of_lies
+ - harness|bbh:word_sorting
+ - harness|bigbench:causal_judgment
+ - harness|bigbench:date_understanding
+ - harness|bigbench:disambiguation_qa
+ - harness|bigbench:geometric_shapes
+ - harness|bigbench:logical_deduction_five_objects
+ - harness|bigbench:logical_deduction_seven_objects
+ - harness|bigbench:logical_deduction_three_objects
+ - harness|bigbench:movie_recommendation
+ - harness|bigbench:navigate
+ - harness|bigbench:reasoning_about_colored_objects
+ - harness|bigbench:ruin_names
+ - harness|bigbench:salient_translation_error_detection
+ - harness|bigbench:snarks
+ - harness|bigbench:sports_understanding
+ - harness|bigbench:temporal_sequences
+ - harness|bigbench:tracking_shuffled_objects_five_objects
+ - harness|bigbench:tracking_shuffled_objects_seven_objects
+ - harness|bigbench:tracking_shuffled_objects_three_objects
+ - harness|wikitext:103:document_level
+
+- helm:
+ - helm|babi_qa
+ - helm|bbq
+ - helm|bbq:Age
+ - helm|bbq:Disability_status
+ - helm|bbq:Gender_identity
+ - helm|bbq:Physical_appearance
+ - helm|bbq:Race_ethnicity
+ - helm|bbq:Race_x_SES
+ - helm|bbq:Race_x_gender
+ - helm|bbq:Religion
+ - helm|bbq:SES
+ - helm|bbq:Sexual_orientation
+ - helm|bbq=Nationality
+ - helm|bigbench:auto_debugging
+ - helm|bigbench:bbq_lite_json:age_ambig
+ - helm|bigbench:bbq_lite_json:age_disambig
+ - helm|bigbench:bbq_lite_json:disability_status_ambig
+ - helm|bigbench:bbq_lite_json:disability_status_disambig
+ - helm|bigbench:bbq_lite_json:gender_identity_ambig
+ - helm|bigbench:bbq_lite_json:gender_identity_disambig
+ - helm|bigbench:bbq_lite_json:nationality_ambig
+ - helm|bigbench:bbq_lite_json:nationality_disambig
+ - helm|bigbench:bbq_lite_json:physical_appearance_ambig
+ - helm|bigbench:bbq_lite_json:physical_appearance_disambig
+ - helm|bigbench:bbq_lite_json:race_ethnicity_ambig
+ - helm|bigbench:bbq_lite_json:race_ethnicity_disambig
+ - helm|bigbench:bbq_lite_json:religion_ambig
+ - helm|bigbench:bbq_lite_json:religion_disambig
+ - helm|bigbench:bbq_lite_json:ses_ambig
+ - helm|bigbench:bbq_lite_json:ses_disambig
+ - helm|bigbench:bbq_lite_json:sexual_orientation_ambig
+ - helm|bigbench:bbq_lite_json:sexual_orientation_disambig
+ - helm|bigbench:code_line_description
+ - helm|bigbench:conceptual_combinations:contradictions
+ - helm|bigbench:conceptual_combinations:emergent_properties
+ - helm|bigbench:conceptual_combinations:fanciful_fictional_combinations
+ - helm|bigbench:conceptual_combinations:homonyms
+ - helm|bigbench:conceptual_combinations:invented_words
+ - helm|bigbench:conlang_translation:adna_from
+ - helm|bigbench:conlang_translation:adna_to
+ - helm|bigbench:conlang_translation:atikampe_from
+ - helm|bigbench:conlang_translation:atikampe_to
+ - helm|bigbench:conlang_translation:gornam_from
+ - helm|bigbench:conlang_translation:gornam_to
+ - helm|bigbench:conlang_translation:holuan_from
+ - helm|bigbench:conlang_translation:holuan_to
+ - helm|bigbench:conlang_translation:mkafala_from
+ - helm|bigbench:conlang_translation:mkafala_to
+ - helm|bigbench:conlang_translation:postpositive_english_from
+ - helm|bigbench:conlang_translation:postpositive_english_to
+ - helm|bigbench:conlang_translation:unapuri_from
+ - helm|bigbench:conlang_translation:unapuri_to
+ - helm|bigbench:conlang_translation:vaomi_from
+ - helm|bigbench:conlang_translation:vaomi_to
+ - helm|bigbench:emoji_movie
+ - helm|bigbench:formal_fallacies_syllogisms_negation
+ - helm|bigbench:hindu_knowledge
+ - helm|bigbench:known_unknowns
+ - helm|bigbench:language_identification
+ - helm|bigbench:linguistics_puzzles
+ - helm|bigbench:logic_grid_puzzle
+ - helm|bigbench:logical_deduction-five_objects
+ - helm|bigbench:logical_deduction-seven_objects
+ - helm|bigbench:logical_deduction-three_objects
+ - helm|bigbench:misconceptions_russian
+ - helm|bigbench:novel_concepts
+ - helm|bigbench:operators
+ - helm|bigbench:parsinlu_reading_comprehension
+ - helm|bigbench:play_dialog_same_or_different
+ - helm|bigbench:repeat_copy_logic
+ - helm|bigbench:strange_stories-boolean
+ - helm|bigbench:strange_stories-multiple_choice
+ - helm|bigbench:strategyqa
+ - helm|bigbench:symbol_interpretation-adversarial
+ - helm|bigbench:symbol_interpretation-emoji_agnostic
+ - helm|bigbench:symbol_interpretation-name_agnostic
+ - helm|bigbench:symbol_interpretation-plain
+ - helm|bigbench:symbol_interpretation-tricky
+ - helm|bigbench:vitaminc_fact_verification
+ - helm|bigbench:winowhy
+ - helm|blimp:adjunct_island
+ - helm|blimp:anaphor_gender_agreement
+ - helm|blimp:anaphor_number_agreement
+ - helm|blimp:animate_subject_passive
+ - helm|blimp:animate_subject_trans
+ - helm|blimp:causative
+ - helm|blimp:complex_NP_island
+ - helm|blimp:coordinate_structure_constraint_complex_left_branch
+ - helm|blimp:coordinate_structure_constraint_object_extraction
+ - helm|blimp:determiner_noun_agreement_1
+ - helm|blimp:determiner_noun_agreement_2
+ - helm|blimp:determiner_noun_agreement_irregular_1
+ - helm|blimp:determiner_noun_agreement_irregular_2
+ - helm|blimp:determiner_noun_agreement_with_adj_2
+ - helm|blimp:determiner_noun_agreement_with_adj_irregular_1
+ - helm|blimp:determiner_noun_agreement_with_adj_irregular_2
+ - helm|blimp:determiner_noun_agreement_with_adjective_1
+ - helm|blimp:distractor_agreement_relational_noun
+ - helm|blimp:distractor_agreement_relative_clause
+ - helm|blimp:drop_argument
+ - helm|blimp:ellipsis_n_bar_1
+ - helm|blimp:ellipsis_n_bar_2
+ - helm|blimp:existential_there_object_raising
+ - helm|blimp:existential_there_quantifiers_1
+ - helm|blimp:existential_there_quantifiers_2
+ - helm|blimp:existential_there_subject_raising
+ - helm|blimp:expletive_it_object_raising
+ - helm|blimp:inchoative
+ - helm|blimp:intransitive
+ - helm|blimp:irregular_past_participle_adjectives
+ - helm|blimp:irregular_past_participle_verbs
+ - helm|blimp:irregular_plural_subject_verb_agreement_1
+ - helm|blimp:irregular_plural_subject_verb_agreement_2
+ - helm|blimp:left_branch_island_echo_question
+ - helm|blimp:left_branch_island_simple_question
+ - helm|blimp:matrix_question_npi_licensor_present
+ - helm|blimp:npi_present_1
+ - helm|blimp:npi_present_2
+ - helm|blimp:only_npi_licensor_present
+ - helm|blimp:only_npi_scope
+ - helm|blimp:passive_1
+ - helm|blimp:passive_2
+ - helm|blimp:principle_A_c_command
+ - helm|blimp:principle_A_case_1
+ - helm|blimp:principle_A_case_2
+ - helm|blimp:principle_A_domain_1
+ - helm|blimp:principle_A_domain_2
+ - helm|blimp:principle_A_domain_3
+ - helm|blimp:principle_A_reconstruction
+ - helm|blimp:regular_plural_subject_verb_agreement_1
+ - helm|blimp:regular_plural_subject_verb_agreement_2
+ - helm|blimp:sentential_negation_npi_licensor_present
+ - helm|blimp:sentential_negation_npi_scope
+ - helm|blimp:sentential_subject_island
+ - helm|blimp:superlative_quantifiers_1
+ - helm|blimp:superlative_quantifiers_2
+ - helm|blimp:tough_vs_raising_1
+ - helm|blimp:tough_vs_raising_2
+ - helm|blimp:transitive
+ - helm|blimp:wh_island
+ - helm|blimp:wh_questions_object_gap
+ - helm|blimp:wh_questions_subject_gap
+ - helm|blimp:wh_questions_subject_gap_long_distance
+ - helm|blimp:wh_vs_that_no_gap
+ - helm|blimp:wh_vs_that_no_gap_long_distance
+ - helm|blimp:wh_vs_that_with_gap
+ - helm|blimp:wh_vs_that_with_gap_long_distance
+ - helm|bold
+ - helm|bold:gender
+ - helm|bold:political_ideology
+ - helm|bold:profession
+ - helm|bold:race
+ - helm|bold:religious_ideology
+ - helm|boolq
+ - helm|boolq:contrastset
+ - helm|civil_comments
+ - helm|civil_comments:LGBTQ
+ - helm|civil_comments:black
+ - helm|civil_comments:christian
+ - helm|civil_comments:female
+ - helm|civil_comments:male
+ - helm|civil_comments:muslim
+ - helm|civil_comments:other_religions
+ - helm|civil_comments:white
+ - helm|commonsenseqa
+ - helm|copyright:n_books_1000-extractions_per_book_1-prefix_length_125
+ - helm|copyright:n_books_1000-extractions_per_book_1-prefix_length_25
+ - helm|copyright:n_books_1000-extractions_per_book_1-prefix_length_5
+ - helm|copyright:n_books_1000-extractions_per_book_3-prefix_length_125
+ - helm|copyright:n_books_1000-extractions_per_book_3-prefix_length_25
+ - helm|copyright:n_books_1000-extractions_per_book_3-prefix_length_5
+ - helm|copyright:oh_the_places
+ - helm|copyright:pilot
+ - helm|copyright:popular_books-prefix_length_10
+ - helm|copyright:popular_books-prefix_length_125
+ - helm|copyright:popular_books-prefix_length_25
+ - helm|copyright:popular_books-prefix_length_250
+ - helm|copyright:popular_books-prefix_length_5
+ - helm|copyright:popular_books-prefix_length_50
+ - helm|copyright:prompt_num_line_1-min_lines_20
+ - helm|copyright:prompt_num_line_10-min_lines_20
+ - helm|copyright:prompt_num_line_5-min_lines_20
+ - helm|covid_dialogue
+ - helm|dyck_language:2
+ - helm|dyck_language:3
+ - helm|dyck_language:4
+ - helm|entity_data_imputation:Buy
+ - helm|entity_data_imputation:Restaurant
+ - helm|entity_matching:Abt_Buy
+ - helm|entity_matching:Amazon_Google
+ - helm|entity_matching:Beer
+ - helm|entity_matching:Company
+ - helm|entity_matching:DBLP_ACM
+ - helm|entity_matching:DBLP_GoogleScholar
+ - helm|entity_matching:Dirty_DBLP_ACM
+ - helm|entity_matching:Dirty_DBLP_GoogleScholar
+ - helm|entity_matching:Dirty_Walmart_Amazon
+ - helm|entity_matching:Dirty_iTunes_Amazon
+ - helm|entity_matching:Walmart_Amazon
+ - helm|entity_matching:iTunes_Amazon
+ - helm|entity_matching=Fodors_Zagats
+ - helm|hellaswag
+ - helm|imdb
+ - helm|imdb:contrastset
+ - helm|interactive_qa_mmlu:abstract_algebra
+ - helm|interactive_qa_mmlu:college_chemistry
+ - helm|interactive_qa_mmlu:global_facts
+ - helm|interactive_qa_mmlu:miscellaneous
+ - helm|interactive_qa_mmlu:nutrition
+ - helm|interactive_qa_mmlu:us_foreign_policy
+ - helm|legal_summarization:billsum
+ - helm|legal_summarization:eurlexsum
+ - helm|legal_summarization:multilexsum
+ - helm|legalsupport
+ - helm|lexglue:case_hold
+ - helm|lexglue:ecthr_a
+ - helm|lexglue:ecthr_b
+ - helm|lexglue:eurlex
+ - helm|lexglue:ledgar
+ - helm|lexglue:scotus
+ - helm|lexglue:unfair_tos
+ - helm|lextreme:brazilian_court_decisions_judgment
+ - helm|lextreme:brazilian_court_decisions_unanimity
+ - helm|lextreme:covid19_emergency_event
+ - helm|lextreme:german_argument_mining
+ - helm|lextreme:greek_legal_code_chapter
+ - helm|lextreme:greek_legal_code_subject
+ - helm|lextreme:greek_legal_code_volume
+ - helm|lextreme:greek_legal_ner
+ - helm|lextreme:legalnero
+ - helm|lextreme:lener_br
+ - helm|lextreme:mapa_coarse
+ - helm|lextreme:mapa_fine
+ - helm|lextreme:multi_eurlex_level_1
+ - helm|lextreme:multi_eurlex_level_2
+ - helm|lextreme:multi_eurlex_level_3
+ - helm|lextreme:online_terms_of_service_clause_topics
+ - helm|lextreme:online_terms_of_service_unfairness_levels
+ - helm|lextreme:swiss_judgment_prediction
+ - helm|lsat_qa
+ - helm|lsat_qa:assignment
+ - helm|lsat_qa:grouping
+ - helm|lsat_qa:miscellaneous
+ - helm|lsat_qa:ordering
+ - helm|me_q_sum
+ - helm|med_dialog:healthcaremagic
+ - helm|med_dialog:icliniq
+ - helm|med_mcqa
+ - helm|med_paragraph_simplification
+ - helm|med_qa
+ - helm|mmlu
+ - helm|mmlu:abstract_algebra
+ - helm|mmlu:anatomy
+ - helm|mmlu:astronomy
+ - helm|mmlu:business_ethics
+ - helm|mmlu:clinical_knowledge
+ - helm|mmlu:college_biology
+ - helm|mmlu:college_chemistry
+ - helm|mmlu:college_computer_science
+ - helm|mmlu:college_mathematics
+ - helm|mmlu:college_medicine
+ - helm|mmlu:college_physics
+ - helm|mmlu:computer_security
+ - helm|mmlu:conceptual_physics
+ - helm|mmlu:econometrics
+ - helm|mmlu:electrical_engineering
+ - helm|mmlu:elementary_mathematics
+ - helm|mmlu:formal_logic
+ - helm|mmlu:global_facts
+ - helm|mmlu:high_school_biology
+ - helm|mmlu:high_school_chemistry
+ - helm|mmlu:high_school_computer_science
+ - helm|mmlu:high_school_european_history
+ - helm|mmlu:high_school_geography
+ - helm|mmlu:high_school_government_and_politics
+ - helm|mmlu:high_school_macroeconomics
+ - helm|mmlu:high_school_mathematics
+ - helm|mmlu:high_school_microeconomics
+ - helm|mmlu:high_school_physics
+ - helm|mmlu:high_school_psychology
+ - helm|mmlu:high_school_statistics
+ - helm|mmlu:high_school_us_history
+ - helm|mmlu:high_school_world_history
+ - helm|mmlu:human_aging
+ - helm|mmlu:human_sexuality
+ - helm|mmlu:international_law
+ - helm|mmlu:jurisprudence
+ - helm|mmlu:logical_fallacies
+ - helm|mmlu:machine_learning
+ - helm|mmlu:management
+ - helm|mmlu:marketing
+ - helm|mmlu:medical_genetics
+ - helm|mmlu:miscellaneous
+ - helm|mmlu:moral_disputes
+ - helm|mmlu:moral_scenarios
+ - helm|mmlu:nutrition
+ - helm|mmlu:philosophy
+ - helm|mmlu:prehistory
+ - helm|mmlu:professional_accounting
+ - helm|mmlu:professional_law
+ - helm|mmlu:professional_medicine
+ - helm|mmlu:professional_psychology
+ - helm|mmlu:public_relations
+ - helm|mmlu:security_studies
+ - helm|mmlu:sociology
+ - helm|mmlu:us_foreign_policy
+ - helm|mmlu:virology
+ - helm|mmlu:world_religions
+ - helm|narrativeqa
+ - helm|numeracy:linear_example
+ - helm|numeracy:linear_standard
+ - helm|numeracy:parabola_example
+ - helm|numeracy:parabola_standard
+ - helm|numeracy:paraboloid_example
+ - helm|numeracy:paraboloid_standard
+ - helm|numeracy:plane_example
+ - helm|numeracy:plane_standard
+ - helm|openbookqa
+ - helm|piqa
+ - helm|pubmedqa
+ - helm|quac
+ - helm|raft:ade_corpus_v2
+ - helm|raft:banking_77
+ - helm|raft:neurips_impact_statement_risks
+ - helm|raft:one_stop_english
+ - helm|raft:overruling
+ - helm|raft:semiconductor_org_types
+ - helm|raft:systematic_review_inclusion
+ - helm|raft:tai_safety_research
+ - helm|raft:terms_of_service
+ - helm|raft:tweet_eval_hate
+ - helm|raft:twitter_complaints
+ - helm|real_toxicity_prompts
+ - helm|siqa
+ - helm|summarization:cnn-dm
+ - helm|summarization:xsum
+ - helm|summarization:xsum-sampled
+ - helm|synthetic_reasoning:induction
+ - helm|synthetic_reasoning:natural_easy
+ - helm|synthetic_reasoning:natural_hard
+ - helm|synthetic_reasoning:pattern_match
+ - helm|synthetic_reasoning:variable_substitution
+ - helm|the_pile:arxiv
+ - helm|the_pile:bibliotik
+ - helm|the_pile:commoncrawl
+ - helm|the_pile:dm-mathematics
+ - helm|the_pile:enron
+ - helm|the_pile:europarl
+ - helm|the_pile:freelaw
+ - helm|the_pile:github
+ - helm|the_pile:gutenberg
+ - helm|the_pile:hackernews
+ - helm|the_pile:nih-exporter
+ - helm|the_pile:opensubtitles
+ - helm|the_pile:openwebtext2
+ - helm|the_pile:pubmed-abstracts
+ - helm|the_pile:pubmed-central
+ - helm|the_pile:stackexchange
+ - helm|the_pile:upsto
+ - helm|the_pile:wikipedia
+ - helm|the_pile:youtubesubtitles
+ - helm|truthfulqa
+ - helm|twitterAAE:aa
+ - helm|twitterAAE:white
+ - helm|wikifact:applies_to_jurisdiction
+ - helm|wikifact:atomic_number
+ - helm|wikifact:author
+ - helm|wikifact:award_received
+ - helm|wikifact:basic_form_of_government
+ - helm|wikifact:capital
+ - helm|wikifact:capital_of
+ - helm|wikifact:central_bank
+ - helm|wikifact:composer
+ - helm|wikifact:continent
+ - helm|wikifact:country
+ - helm|wikifact:country_of_citizenship
+ - helm|wikifact:country_of_origin
+ - helm|wikifact:creator
+ - helm|wikifact:currency
+ - helm|wikifact:defendant
+ - helm|wikifact:developer
+ - helm|wikifact:diplomatic_relation
+ - helm|wikifact:director
+ - helm|wikifact:discoverer_or_inventor
+ - helm|wikifact:drug_or_therapy_used_for_treatment
+ - helm|wikifact:educated_at
+ - helm|wikifact:electron_configuration
+ - helm|wikifact:employer
+ - helm|wikifact:field_of_work
+ - helm|wikifact:file_extension
+ - helm|wikifact:genetic_association
+ - helm|wikifact:genre
+ - helm|wikifact:has_part
+ - helm|wikifact:head_of_government
+ - helm|wikifact:head_of_state
+ - helm|wikifact:headquarters_location
+ - helm|wikifact:industry
+ - helm|wikifact:influenced_by
+ - helm|wikifact:instance_of
+ - helm|wikifact:instrument
+ - helm|wikifact:language_of_work_or_name
+ - helm|wikifact:languages_spoken_written_or_signed
+ - helm|wikifact:laws_applied
+ - helm|wikifact:located_in_the_administrative_territorial_entity
+ - helm|wikifact:location
+ - helm|wikifact:location_of_discovery
+ - helm|wikifact:location_of_formation
+ - helm|wikifact:majority_opinion_by
+ - helm|wikifact:manufacturer
+ - helm|wikifact:measured_physical_quantity
+ - helm|wikifact:medical_condition_treated
+ - helm|wikifact:member_of
+ - helm|wikifact:member_of_political_party
+ - helm|wikifact:member_of_sports_team
+ - helm|wikifact:movement
+ - helm|wikifact:named_after
+ - helm|wikifact:native_language
+ - helm|wikifact:number_of_processor_cores
+ - helm|wikifact:occupation
+ - helm|wikifact:office_held_by_head_of_government
+ - helm|wikifact:office_held_by_head_of_state
+ - helm|wikifact:official_language
+ - helm|wikifact:operating_system
+ - helm|wikifact:original_language_of_film_or_TV_show
+ - helm|wikifact:original_network
+ - helm|wikifact:overrules
+ - helm|wikifact:owned_by
+ - helm|wikifact:part_of
+ - helm|wikifact:participating_team
+ - helm|wikifact:place_of_birth
+ - helm|wikifact:place_of_death
+ - helm|wikifact:plaintiff
+ - helm|wikifact:position_held
+ - helm|wikifact:position_played_on_team
+ - helm|wikifact:programming_language
+ - helm|wikifact:recommended_unit_of_measurement
+ - helm|wikifact:record_label
+ - helm|wikifact:religion
+ - helm|wikifact:repealed_by
+ - helm|wikifact:shares_border_with
+ - helm|wikifact:solved_by
+ - helm|wikifact:statement_describes
+ - helm|wikifact:stock_exchange
+ - helm|wikifact:subclass_of
+ - helm|wikifact:subsidiary
+ - helm|wikifact:symptoms_and_signs
+ - helm|wikifact:therapeutic_area
+ - helm|wikifact:time_of_discovery_or_invention
+ - helm|wikifact:twinned_administrative_body
+ - helm|wikifact:work_location
+ - helm|wikitext:103:document_level
+ - helm|wmt14:cs-en
+ - helm|wmt14:de-en
+ - helm|wmt14:fr-en
+ - helm|wmt14:hi-en
+ - helm|wmt14:ru-en
+
+- leaderboard:
+ - leaderboard|arc:challenge
+ - leaderboard|gsm8k
+ - leaderboard|hellaswag
+ - leaderboard|mmlu:abstract_algebra
+ - leaderboard|mmlu:anatomy
+ - leaderboard|mmlu:astronomy
+ - leaderboard|mmlu:business_ethics
+ - leaderboard|mmlu:clinical_knowledge
+ - leaderboard|mmlu:college_biology
+ - leaderboard|mmlu:college_chemistry
+ - leaderboard|mmlu:college_computer_science
+ - leaderboard|mmlu:college_mathematics
+ - leaderboard|mmlu:college_medicine
+ - leaderboard|mmlu:college_physics
+ - leaderboard|mmlu:computer_security
+ - leaderboard|mmlu:conceptual_physics
+ - leaderboard|mmlu:econometrics
+ - leaderboard|mmlu:electrical_engineering
+ - leaderboard|mmlu:elementary_mathematics
+ - leaderboard|mmlu:formal_logic
+ - leaderboard|mmlu:global_facts
+ - leaderboard|mmlu:high_school_biology
+ - leaderboard|mmlu:high_school_chemistry
+ - leaderboard|mmlu:high_school_computer_science
+ - leaderboard|mmlu:high_school_european_history
+ - leaderboard|mmlu:high_school_geography
+ - leaderboard|mmlu:high_school_government_and_politics
+ - leaderboard|mmlu:high_school_macroeconomics
+ - leaderboard|mmlu:high_school_mathematics
+ - leaderboard|mmlu:high_school_microeconomics
+ - leaderboard|mmlu:high_school_physics
+ - leaderboard|mmlu:high_school_psychology
+ - leaderboard|mmlu:high_school_statistics
+ - leaderboard|mmlu:high_school_us_history
+ - leaderboard|mmlu:high_school_world_history
+ - leaderboard|mmlu:human_aging
+ - leaderboard|mmlu:human_sexuality
+ - leaderboard|mmlu:international_law
+ - leaderboard|mmlu:jurisprudence
+ - leaderboard|mmlu:logical_fallacies
+ - leaderboard|mmlu:machine_learning
+ - leaderboard|mmlu:management
+ - leaderboard|mmlu:marketing
+ - leaderboard|mmlu:medical_genetics
+ - leaderboard|mmlu:miscellaneous
+ - leaderboard|mmlu:moral_disputes
+ - leaderboard|mmlu:moral_scenarios
+ - leaderboard|mmlu:nutrition
+ - leaderboard|mmlu:philosophy
+ - leaderboard|mmlu:prehistory
+ - leaderboard|mmlu:professional_accounting
+ - leaderboard|mmlu:professional_law
+ - leaderboard|mmlu:professional_medicine
+ - leaderboard|mmlu:professional_psychology
+ - leaderboard|mmlu:public_relations
+ - leaderboard|mmlu:security_studies
+ - leaderboard|mmlu:sociology
+ - leaderboard|mmlu:us_foreign_policy
+ - leaderboard|mmlu:virology
+ - leaderboard|mmlu:world_religions
+ - leaderboard|truthfulqa:mc
+ - leaderboard|winogrande
+
+- lighteval:
+ - lighteval|agieval:aqua-rat
+ - lighteval|agieval:gaokao-biology
+ - lighteval|agieval:gaokao-chemistry
+ - lighteval|agieval:gaokao-chinese
+ - lighteval|agieval:gaokao-english
+ - lighteval|agieval:gaokao-geography
+ - lighteval|agieval:gaokao-history
+ - lighteval|agieval:gaokao-mathqa
+ - lighteval|agieval:gaokao-physics
+ - lighteval|agieval:logiqa-en
+ - lighteval|agieval:logiqa-zh
+ - lighteval|agieval:lsat-ar
+ - lighteval|agieval:lsat-lr
+ - lighteval|agieval:lsat-rc
+ - lighteval|agieval:sat-en
+ - lighteval|agieval:sat-en-without-passage
+ - lighteval|agieval:sat-math
+ - lighteval|anli
+ - lighteval|anli:r1
+ - lighteval|anli:r2
+ - lighteval|anli:r3
+ - lighteval|arc:easy
+ - lighteval|arithmetic:1dc
+ - lighteval|arithmetic:2da
+ - lighteval|arithmetic:2dm
+ - lighteval|arithmetic:2ds
+ - lighteval|arithmetic:3da
+ - lighteval|arithmetic:3ds
+ - lighteval|arithmetic:4da
+ - lighteval|arithmetic:4ds
+ - lighteval|arithmetic:5da
+ - lighteval|arithmetic:5ds
+ - lighteval|asdiv
+ - lighteval|bigbench:causal_judgment
+ - lighteval|bigbench:date_understanding
+ - lighteval|bigbench:disambiguation_qa
+ - lighteval|bigbench:geometric_shapes
+ - lighteval|bigbench:logical_deduction_five_objects
+ - lighteval|bigbench:logical_deduction_seven_objects
+ - lighteval|bigbench:logical_deduction_three_objects
+ - lighteval|bigbench:movie_recommendation
+ - lighteval|bigbench:navigate
+ - lighteval|bigbench:reasoning_about_colored_objects
+ - lighteval|bigbench:ruin_names
+ - lighteval|bigbench:salient_translation_error_detection
+ - lighteval|bigbench:snarks
+ - lighteval|bigbench:sports_understanding
+ - lighteval|bigbench:temporal_sequences
+ - lighteval|bigbench:tracking_shuffled_objects_five_objects
+ - lighteval|bigbench:tracking_shuffled_objects_seven_objects
+ - lighteval|bigbench:tracking_shuffled_objects_three_objects
+ - lighteval|blimp:adjunct_island
+ - lighteval|blimp:anaphor_gender_agreement
+ - lighteval|blimp:anaphor_number_agreement
+ - lighteval|blimp:animate_subject_passive
+ - lighteval|blimp:animate_subject_trans
+ - lighteval|blimp:causative
+ - lighteval|blimp:complex_NP_island
+ - lighteval|blimp:coordinate_structure_constraint_complex_left_branch
+ - lighteval|blimp:coordinate_structure_constraint_object_extraction
+ - lighteval|blimp:determiner_noun_agreement_1
+ - lighteval|blimp:determiner_noun_agreement_2
+ - lighteval|blimp:determiner_noun_agreement_irregular_1
+ - lighteval|blimp:determiner_noun_agreement_irregular_2
+ - lighteval|blimp:determiner_noun_agreement_with_adj_2
+ - lighteval|blimp:determiner_noun_agreement_with_adj_irregular_1
+ - lighteval|blimp:determiner_noun_agreement_with_adj_irregular_2
+ - lighteval|blimp:determiner_noun_agreement_with_adjective_1
+ - lighteval|blimp:distractor_agreement_relational_noun
+ - lighteval|blimp:distractor_agreement_relative_clause
+ - lighteval|blimp:drop_argument
+ - lighteval|blimp:ellipsis_n_bar_1
+ - lighteval|blimp:ellipsis_n_bar_2
+ - lighteval|blimp:existential_there_object_raising
+ - lighteval|blimp:existential_there_quantifiers_1
+ - lighteval|blimp:existential_there_quantifiers_2
+ - lighteval|blimp:existential_there_subject_raising
+ - lighteval|blimp:expletive_it_object_raising
+ - lighteval|blimp:inchoative
+ - lighteval|blimp:intransitive
+ - lighteval|blimp:irregular_past_participle_adjectives
+ - lighteval|blimp:irregular_past_participle_verbs
+ - lighteval|blimp:irregular_plural_subject_verb_agreement_1
+ - lighteval|blimp:irregular_plural_subject_verb_agreement_2
+ - lighteval|blimp:left_branch_island_echo_question
+ - lighteval|blimp:left_branch_island_simple_question
+ - lighteval|blimp:matrix_question_npi_licensor_present
+ - lighteval|blimp:npi_present_1
+ - lighteval|blimp:npi_present_2
+ - lighteval|blimp:only_npi_licensor_present
+ - lighteval|blimp:only_npi_scope
+ - lighteval|blimp:passive_1
+ - lighteval|blimp:passive_2
+ - lighteval|blimp:principle_A_c_command
+ - lighteval|blimp:principle_A_case_1
+ - lighteval|blimp:principle_A_case_2
+ - lighteval|blimp:principle_A_domain_1
+ - lighteval|blimp:principle_A_domain_2
+ - lighteval|blimp:principle_A_domain_3
+ - lighteval|blimp:principle_A_reconstruction
+ - lighteval|blimp:regular_plural_subject_verb_agreement_1
+ - lighteval|blimp:regular_plural_subject_verb_agreement_2
+ - lighteval|blimp:sentential_negation_npi_licensor_present
+ - lighteval|blimp:sentential_negation_npi_scope
+ - lighteval|blimp:sentential_subject_island
+ - lighteval|blimp:superlative_quantifiers_1
+ - lighteval|blimp:superlative_quantifiers_2
+ - lighteval|blimp:tough_vs_raising_1
+ - lighteval|blimp:tough_vs_raising_2
+ - lighteval|blimp:transitive
+ - lighteval|blimp:wh_island
+ - lighteval|blimp:wh_questions_object_gap
+ - lighteval|blimp:wh_questions_subject_gap
+ - lighteval|blimp:wh_questions_subject_gap_long_distance
+ - lighteval|blimp:wh_vs_that_no_gap
+ - lighteval|blimp:wh_vs_that_no_gap_long_distance
+ - lighteval|blimp:wh_vs_that_with_gap
+ - lighteval|blimp:wh_vs_that_with_gap_long_distance
+ - lighteval|coqa
+ - lighteval|coqa_bb
+ - lighteval|drop
+ - lighteval|ethics:commonsense
+ - lighteval|ethics:deontology
+ - lighteval|ethics:justice
+ - lighteval|ethics:utilitarianism
+ - lighteval|ethics:virtue
+ - lighteval|glue:cola
+ - lighteval|glue:mnli
+ - lighteval|glue:mnli_mismatched
+ - lighteval|glue:mrpc
+ - lighteval|glue:qnli
+ - lighteval|glue:qqp
+ - lighteval|glue:rte
+ - lighteval|glue:sst2
+ - lighteval|glue:stsb
+ - lighteval|glue:wnli
+ - lighteval|gpqa
+ - lighteval|gsm8k
+ - lighteval|headqa:en
+ - lighteval|headqa:es
+ - lighteval|iwslt17:ar-en
+ - lighteval|iwslt17:de-en
+ - lighteval|iwslt17:en-ar
+ - lighteval|iwslt17:en-de
+ - lighteval|iwslt17:en-fr
+ - lighteval|iwslt17:en-ja
+ - lighteval|iwslt17:en-ko
+ - lighteval|iwslt17:en-zh
+ - lighteval|iwslt17:fr-en
+ - lighteval|iwslt17:ja-en
+ - lighteval|iwslt17:ko-en
+ - lighteval|iwslt17:zh-en
+ - lighteval|lambada:openai
+ - lighteval|lambada:openai:de
+ - lighteval|lambada:openai:en
+ - lighteval|lambada:openai:es
+ - lighteval|lambada:openai:fr
+ - lighteval|lambada:openai:it
+ - lighteval|lambada:openai_cloze
+ - lighteval|lambada:standard
+ - lighteval|lambada:standard_cloze
+ - lighteval|logiqa
+ - lighteval|math:algebra
+ - lighteval|math:counting_and_probability
+ - lighteval|math:geometry
+ - lighteval|math:intermediate_algebra
+ - lighteval|math:number_theory
+ - lighteval|math:prealgebra
+ - lighteval|math:precalculus
+ - lighteval|math_cot:algebra
+ - lighteval|math_cot:counting_and_probability
+ - lighteval|math_cot:geometry
+ - lighteval|math_cot:intermediate_algebra
+ - lighteval|math_cot:number_theory
+ - lighteval|math_cot:prealgebra
+ - lighteval|math_cot:precalculus
+ - lighteval|mathqa
+ - lighteval|mgsm:bn
+ - lighteval|mgsm:de
+ - lighteval|mgsm:en
+ - lighteval|mgsm:es
+ - lighteval|mgsm:fr
+ - lighteval|mgsm:ja
+ - lighteval|mgsm:ru
+ - lighteval|mgsm:sw
+ - lighteval|mgsm:te
+ - lighteval|mgsm:th
+ - lighteval|mgsm:zh
+ - lighteval|mtnt2019:en-fr
+ - lighteval|mtnt2019:en-ja
+ - lighteval|mtnt2019:fr-en
+ - lighteval|mtnt2019:ja-en
+ - lighteval|mutual
+ - lighteval|mutual_plus
+ - lighteval|openbookqa
+ - lighteval|piqa
+ - lighteval|prost
+ - lighteval|pubmedqa
+ - lighteval|qa4mre:2011
+ - lighteval|qa4mre:2012
+ - lighteval|qa4mre:2013
+ - lighteval|qasper
+ - lighteval|qasper_ll
+ - lighteval|race:high
+ - lighteval|sciq
+ - lighteval|storycloze:2016
+ - lighteval|storycloze:2018
+ - lighteval|super_glue:boolq
+ - lighteval|super_glue:cb
+ - lighteval|super_glue:copa
+ - lighteval|super_glue:multirc
+ - lighteval|super_glue:rte
+ - lighteval|super_glue:wic
+ - lighteval|super_glue:wsc
+ - lighteval|swag
+ - lighteval|the_pile:arxiv
+ - lighteval|the_pile:bookcorpus2
+ - lighteval|the_pile:books3
+ - lighteval|the_pile:dm-mathematics
+ - lighteval|the_pile:enron
+ - lighteval|the_pile:europarl
+ - lighteval|the_pile:freelaw
+ - lighteval|the_pile:github
+ - lighteval|the_pile:gutenberg
+ - lighteval|the_pile:hackernews
+ - lighteval|the_pile:nih-exporter
+ - lighteval|the_pile:opensubtitles
+ - lighteval|the_pile:openwebtext2
+ - lighteval|the_pile:philpapers
+ - lighteval|the_pile:pile-cc
+ - lighteval|the_pile:pubmed-abstracts
+ - lighteval|the_pile:pubmed-central
+ - lighteval|the_pile:stackexchange
+ - lighteval|the_pile:ubuntu-irc
+ - lighteval|the_pile:uspto
+ - lighteval|the_pile:wikipedia
+ - lighteval|the_pile:youtubesubtitles
+ - lighteval|toxigen
+ - lighteval|triviaqa
+ - lighteval|truthfulqa:gen
+ - lighteval|unscramble:anagrams1
+ - lighteval|unscramble:anagrams2
+ - lighteval|unscramble:cycle_letters
+ - lighteval|unscramble:random_insertion
+ - lighteval|unscramble:reversed_words
+ - lighteval|webqs
+ - lighteval|wikitext:2
+ - lighteval|wmt08:cs-en
+ - lighteval|wmt08:de-en
+ - lighteval|wmt08:en-cs
+ - lighteval|wmt08:en-de
+ - lighteval|wmt08:en-es
+ - lighteval|wmt08:en-fr
+ - lighteval|wmt08:en-hu
+ - lighteval|wmt08:es-en
+ - lighteval|wmt08:fr-en
+ - lighteval|wmt08:hu-en
+ - lighteval|wmt09:cs-en
+ - lighteval|wmt09:de-en
+ - lighteval|wmt09:en-cs
+ - lighteval|wmt09:en-de
+ - lighteval|wmt09:en-es
+ - lighteval|wmt09:en-fr
+ - lighteval|wmt09:en-hu
+ - lighteval|wmt09:en-it
+ - lighteval|wmt09:es-en
+ - lighteval|wmt09:fr-en
+ - lighteval|wmt09:hu-en
+ - lighteval|wmt09:it-en
+ - lighteval|wmt10:cs-en
+ - lighteval|wmt10:de-en
+ - lighteval|wmt10:en-cs
+ - lighteval|wmt10:en-de
+ - lighteval|wmt10:en-es
+ - lighteval|wmt10:en-fr
+ - lighteval|wmt10:es-en
+ - lighteval|wmt10:fr-en
+ - lighteval|wmt11:cs-en
+ - lighteval|wmt11:de-en
+ - lighteval|wmt11:en-cs
+ - lighteval|wmt11:en-de
+ - lighteval|wmt11:en-es
+ - lighteval|wmt11:en-fr
+ - lighteval|wmt11:es-en
+ - lighteval|wmt11:fr-en
+ - lighteval|wmt12:cs-en
+ - lighteval|wmt12:de-en
+ - lighteval|wmt12:en-cs
+ - lighteval|wmt12:en-de
+ - lighteval|wmt12:en-es
+ - lighteval|wmt12:en-fr
+ - lighteval|wmt12:es-en
+ - lighteval|wmt12:fr-en
+ - lighteval|wmt13:cs-en
+ - lighteval|wmt13:de-en
+ - lighteval|wmt13:en-cs
+ - lighteval|wmt13:en-de
+ - lighteval|wmt13:en-es
+ - lighteval|wmt13:en-fr
+ - lighteval|wmt13:en-ru
+ - lighteval|wmt13:es-en
+ - lighteval|wmt13:fr-en
+ - lighteval|wmt13:ru-en
+ - lighteval|wmt14:cs-en
+ - lighteval|wmt14:de-en
+ - lighteval|wmt14:en-cs
+ - lighteval|wmt14:en-de
+ - lighteval|wmt14:en-fr
+ - lighteval|wmt14:en-hi
+ - lighteval|wmt14:en-ru
+ - lighteval|wmt14:fr-en
+ - lighteval|wmt14:hi-en
+ - lighteval|wmt14:ru-en
+ - lighteval|wmt15:cs-en
+ - lighteval|wmt15:de-en
+ - lighteval|wmt15:en-cs
+ - lighteval|wmt15:en-de
+ - lighteval|wmt15:en-fi
+ - lighteval|wmt15:en-fr
+ - lighteval|wmt15:en-ru
+ - lighteval|wmt15:fi-en
+ - lighteval|wmt15:fr-en
+ - lighteval|wmt15:ru-en
+ - lighteval|wmt16:cs-en
+ - lighteval|wmt16:de-en
+ - lighteval|wmt16:en-cs
+ - lighteval|wmt16:en-de
+ - lighteval|wmt16:en-fi
+ - lighteval|wmt16:en-ro
+ - lighteval|wmt16:en-ru
+ - lighteval|wmt16:en-tr
+ - lighteval|wmt16:fi-en
+ - lighteval|wmt16:ro-en
+ - lighteval|wmt16:ru-en
+ - lighteval|wmt16:tr-en
+ - lighteval|wmt17:cs-en
+ - lighteval|wmt17:de-en
+ - lighteval|wmt17:en-cs
+ - lighteval|wmt17:en-de
+ - lighteval|wmt17:en-fi
+ - lighteval|wmt17:en-lv
+ - lighteval|wmt17:en-ru
+ - lighteval|wmt17:en-tr
+ - lighteval|wmt17:en-zh
+ - lighteval|wmt17:fi-en
+ - lighteval|wmt17:lv-en
+ - lighteval|wmt17:ru-en
+ - lighteval|wmt17:tr-en
+ - lighteval|wmt17:zh-en
+ - lighteval|wmt18:cs-en
+ - lighteval|wmt18:de-en
+ - lighteval|wmt18:en-cs
+ - lighteval|wmt18:en-de
+ - lighteval|wmt18:en-et
+ - lighteval|wmt18:en-fi
+ - lighteval|wmt18:en-ru
+ - lighteval|wmt18:en-tr
+ - lighteval|wmt18:en-zh
+ - lighteval|wmt18:et-en
+ - lighteval|wmt18:fi-en
+ - lighteval|wmt18:ru-en
+ - lighteval|wmt18:tr-en
+ - lighteval|wmt18:zh-en
+ - lighteval|wmt19:cs-de
+ - lighteval|wmt19:de-cs
+ - lighteval|wmt19:de-en
+ - lighteval|wmt19:de-fr
+ - lighteval|wmt19:en-cs
+ - lighteval|wmt19:en-de
+ - lighteval|wmt19:en-fi
+ - lighteval|wmt19:en-gu
+ - lighteval|wmt19:en-kk
+ - lighteval|wmt19:en-lt
+ - lighteval|wmt19:en-ru
+ - lighteval|wmt19:en-zh
+ - lighteval|wmt19:fi-en
+ - lighteval|wmt19:fr-de
+ - lighteval|wmt19:gu-en
+ - lighteval|wmt19:kk-en
+ - lighteval|wmt19:lt-en
+ - lighteval|wmt19:ru-en
+ - lighteval|wmt19:zh-en
+ - lighteval|wmt20:cs-en
+ - lighteval|wmt20:de-en
+ - lighteval|wmt20:de-fr
+ - lighteval|wmt20:en-cs
+ - lighteval|wmt20:en-de
+ - lighteval|wmt20:en-iu
+ - lighteval|wmt20:en-ja
+ - lighteval|wmt20:en-km
+ - lighteval|wmt20:en-pl
+ - lighteval|wmt20:en-ps
+ - lighteval|wmt20:en-ru
+ - lighteval|wmt20:en-ta
+ - lighteval|wmt20:en-zh
+ - lighteval|wmt20:fr-de
+ - lighteval|wmt20:iu-en
+ - lighteval|wmt20:ja-en
+ - lighteval|wmt20:km-en
+ - lighteval|wmt20:pl-en
+ - lighteval|wmt20:ps-en
+ - lighteval|wmt20:ru-en
+ - lighteval|wmt20:ta-en
+ - lighteval|wmt20:zh-en
+ - lighteval|wsc273
+ - lighteval|xcopa:en
+ - lighteval|xcopa:et
+ - lighteval|xcopa:ht
+ - lighteval|xcopa:id
+ - lighteval|xcopa:it
+ - lighteval|xcopa:qu
+ - lighteval|xcopa:sw
+ - lighteval|xcopa:ta
+ - lighteval|xcopa:th
+ - lighteval|xcopa:tr
+ - lighteval|xcopa:vi
+ - lighteval|xcopa:zh
+ - lighteval|xstory_cloze:ar
+ - lighteval|xstory_cloze:en
+ - lighteval|xstory_cloze:es
+ - lighteval|xstory_cloze:eu
+ - lighteval|xstory_cloze:hi
+ - lighteval|xstory_cloze:id
+ - lighteval|xstory_cloze:my
+ - lighteval|xstory_cloze:ru
+ - lighteval|xstory_cloze:sw
+ - lighteval|xstory_cloze:te
+ - lighteval|xstory_cloze:zh
+ - lighteval|xwinograd:en
+ - lighteval|xwinograd:fr
+ - lighteval|xwinograd:jp
+ - lighteval|xwinograd:pt
+ - lighteval|xwinograd:ru
+ - lighteval|xwinograd:zh
+
+- original:
+ - original|arc:c:letters
+ - original|arc:c:options
+ - original|arc:c:simple
+ - original|mmlu
+ - original|mmlu:abstract_algebra
+ - original|mmlu:anatomy
+ - original|mmlu:astronomy
+ - original|mmlu:business_ethics
+ - original|mmlu:clinical_knowledge
+ - original|mmlu:college_biology
+ - original|mmlu:college_chemistry
+ - original|mmlu:college_computer_science
+ - original|mmlu:college_mathematics
+ - original|mmlu:college_medicine
+ - original|mmlu:college_physics
+ - original|mmlu:computer_security
+ - original|mmlu:conceptual_physics
+ - original|mmlu:econometrics
+ - original|mmlu:electrical_engineering
+ - original|mmlu:elementary_mathematics
+ - original|mmlu:formal_logic
+ - original|mmlu:global_facts
+ - original|mmlu:high_school_biology
+ - original|mmlu:high_school_chemistry
+ - original|mmlu:high_school_computer_science
+ - original|mmlu:high_school_european_history
+ - original|mmlu:high_school_geography
+ - original|mmlu:high_school_government_and_politics
+ - original|mmlu:high_school_macroeconomics
+ - original|mmlu:high_school_mathematics
+ - original|mmlu:high_school_microeconomics
+ - original|mmlu:high_school_physics
+ - original|mmlu:high_school_psychology
+ - original|mmlu:high_school_statistics
+ - original|mmlu:high_school_us_history
+ - original|mmlu:high_school_world_history
+ - original|mmlu:human_aging
+ - original|mmlu:human_sexuality
+ - original|mmlu:international_law
+ - original|mmlu:jurisprudence
+ - original|mmlu:logical_fallacies
+ - original|mmlu:machine_learning
+ - original|mmlu:management
+ - original|mmlu:marketing
+ - original|mmlu:medical_genetics
+ - original|mmlu:miscellaneous
+ - original|mmlu:moral_disputes
+ - original|mmlu:moral_scenarios
+ - original|mmlu:nutrition
+ - original|mmlu:philosophy
+ - original|mmlu:prehistory
+ - original|mmlu:professional_accounting
+ - original|mmlu:professional_law
+ - original|mmlu:professional_medicine
+ - original|mmlu:professional_psychology
+ - original|mmlu:public_relations
+ - original|mmlu:security_studies
+ - original|mmlu:sociology
+ - original|mmlu:us_foreign_policy
+ - original|mmlu:virology
+ - original|mmlu:world_religions
diff --git a/docs/source/contributing-to-multilingual-evaluations.mdx b/docs/source/contributing-to-multilingual-evaluations.mdx
new file mode 100644
index 000000000..25779bc38
--- /dev/null
+++ b/docs/source/contributing-to-multilingual-evaluations.mdx
@@ -0,0 +1,107 @@
+# Contributing to multilingual evaluations
+
+## Contributing a small translation
+
+We define 19 `literals`: basic keywords and punctuation signs used when automatically building evaluation prompts, such as `yes`, `no`, `because`, etc.
+
+We welcome translations in your language!
+
+To contribute, you'll need to:
+1. Open the [translation_literals](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/templates/utils/translation_literals.py) file
+2. Edit the file to add or expand the literals for your language of interest (see the reference English entry and the example sketch below).
+
+```python
+ Language.ENGLISH: TranslationLiterals(
+ language=Language.ENGLISH,
+ question_word="question", # Usage: "Question: How are you?"
+ answer="answer", # Usage: "Answer: I am fine"
+ confirmation_word="right", # Usage: "He is smart, right?"
+ yes="yes", # Usage: "Yes, he is"
+ no="no", # Usage: "No, he is not"
+ also="also", # Usage: "Also, she is smart."
+ cause_word="because", # Usage: "She is smart, because she is tall"
+ effect_word="therefore", # Usage: "He is tall therefore he is smart"
+ or_word="or", # Usage: "He is tall or small"
+ true="true", # Usage: "He is smart, true, false or neither?"
+ false="false", # Usage: "He is smart, true, false or neither?"
+ neither="neither", # Usage: "He is smart, true, false or neither?"
+ # Punctuation and spacing: only adjust if your language uses something different than in English
+ full_stop=".",
+ comma=",",
+ question_mark="?",
+ exclamation_mark="!",
+ word_space=" ",
+ sentence_space=" ",
+ colon=":",
+ # The first characters of your alphabet used in enumerations, if different from English
+ indices=["A", "B", "C", ...]
+ )
+```
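+
+For example, a new entry for French could look like the following partial sketch (the `Language.FRENCH` member name and the exact field values are illustrative assumptions; mirror the English reference entry above and check whether an entry for your language already exists before adding one):
+
+```python
+    Language.FRENCH: TranslationLiterals(
+        language=Language.FRENCH,
+        question_word="question",          # Usage: "Question : Comment ça va ?"
+        answer="réponse",                  # Usage: "Réponse : Je vais bien"
+        confirmation_word="n'est-ce pas",  # Usage: "Il est intelligent, n'est-ce pas ?"
+        yes="oui",
+        no="non",
+        also="aussi",
+        cause_word="parce que",
+        effect_word="donc",
+        or_word="ou",
+        true="vrai",
+        false="faux",
+        neither="aucun des deux",
+        # French typography puts a space before some punctuation marks
+        question_mark=" ?",
+        exclamation_mark=" !",
+        colon=" :",
+        indices=["A", "B", "C"],
+    )
+```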
+
+3. Open a PR with your modifications! And voilà!
+
+## Contributing a new multilingual task
+
+You should first read our guide on [adding a custom task](adding-a-custom-task), to better understand the different parameters we use.
+
+Then, take a look at the current [multilingual tasks](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/multilingual/tasks.py) file to understand how they are defined. For multilingual evaluations, the `prompt_function` should be built from a language-adapted template. The template takes care of correct formatting, as well as correct and consistent usage of language-adjusted prompt anchors (e.g. Question/Answer) and punctuation.
+
+Browse the list of all templates [here](https://github.com/huggingface/lighteval/tree/main/src/lighteval/tasks/templates) to see which ones best fit your task.
+
+Then, when ready to define your own task, you should:
+1. create a Python file as indicated in the above guide
+2. import the relevant templates for your task type (XNLI, COPA, multiple choice, question answering, etc.)
+3. define one task (or a list of tasks) for each relevant language and evaluation formulation (for multichoice) using our parametrizable `LightevalTaskConfig` class
+
+```python
+your_tasks = [
+ LightevalTaskConfig(
+ # Name of your evaluation
+ name=f"evalname_{language.value}_{formulation.name.lower()}",
+ # The evaluation is community contributed
+ suite=["community"],
+ # This will automatically get the correct metrics for your chosen formulation
+ metric=get_metrics_for_formulation(
+ formulation,
+ [
+ loglikelihood_acc_metric(normalization=None),
+ loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
+ loglikelihood_acc_metric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ # In this function, you choose which template to follow and for which language and formulation
+ prompt_function=get_template_prompt_function(
+ language=language,
+ # then use the adapter to define the mapping between the
+ # keys of the template (left), and the keys of your dataset
+ # (right)
+ # To know which template keys are required and available,
+ # consult the appropriate adapter type and doc-string.
+ adapter=lambda line: {
+ "key": line["relevant_key"],
+ ...
+ },
+ formulation=formulation,
+ ),
+ # You can also add specific filters to remove irrelevant samples
+        hf_filter=lambda line: line["label"] in [...],  # fill in the labels to keep
+ # You then select your huggingface dataset as well as
+ # the splits available for evaluation
+        hf_repo=...,  # fill in your dataset repository
+        hf_subset=...,  # fill in your dataset subset
+ evaluation_splits=["train"],
+ hf_avail_splits=["train"],
+ )
+ for language in [
+ Language.YOUR_LANGUAGE, ...
+ ]
+ for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
+```
+4. Then, go back to the guide to test whether your task is correctly implemented!
+
+> [!TIP]
+> All `LightevalTaskConfig` parameters are strongly typed, including the inputs to the template function. Make sure to take advantage of your IDE's functionality to make it easier to correctly fill these parameters.
+
+
+Once everything is good, open a PR, and we'll be happy to review it!
\ No newline at end of file
diff --git a/docs/source/evaluate-the-model-on-a-server-or-container.mdx b/docs/source/evaluate-the-model-on-a-server-or-container.mdx
new file mode 100644
index 000000000..da8f1d4b7
--- /dev/null
+++ b/docs/source/evaluate-the-model-on-a-server-or-container.mdx
@@ -0,0 +1,67 @@
+# Evaluate the model on a server or container
+
+An alternative to launching the evaluation locally is to serve the model on a
+TGI-compatible server/container and then run the evaluation by sending requests
+to the server. The command is the same as before, except you specify a path to
+a yaml config file (detailed below):
+
+```bash
+lighteval accelerate \
+ --model_config_path="/path/to/config/file"\
+ --tasks \
+ --output_dir output_dir
+```
+
+There are two types of configuration files that can be provided for running on
+the server:
+
+### Hugging Face Inference Endpoints
+
+To launch a model using Hugging Face's Inference Endpoints, you need to provide
+a configuration file such as `endpoint_model.yaml`. Lighteval will automatically deploy
+the endpoint, run the evaluation, and finally delete the endpoint (unless you
+specify an endpoint that was already launched, in which case the endpoint won't
+be deleted afterwards).
+
+__configuration file example:__
+
+```yaml
+model:
+ type: "endpoint"
+ base_params:
+ endpoint_name: "llama-2-7B-lighteval" # needs to be lower case without special characters
+ model: "meta-llama/Llama-2-7b-hf"
+ revision: "main"
+ dtype: "float16" # can be any of "awq", "eetq", "gptq", "4bit' or "8bit" (will use bitsandbytes), "bfloat16" or "float16"
+ reuse_existing: false # if true, ignore all params in instance, and don't delete the endpoint after evaluation
+ instance:
+ accelerator: "gpu"
+ region: "eu-west-1"
+ vendor: "aws"
+ instance_size: "medium"
+ instance_type: "g5.2xlarge"
+ framework: "pytorch"
+ endpoint_type: "protected"
+    namespace: null # The namespace under which to launch the endpoint. Defaults to the current user's namespace
+    image_url: null # Optionally specify the Docker image to use when launching the endpoint model, e.g. a more recent release of the TGI container with support for newer models.
+ env_vars:
+ null # Optional environment variables to include when launching the endpoint. e.g., `MAX_INPUT_LENGTH: 2048`
+ generation:
+ add_special_tokens: true
+```
+
+### Text Generation Inference (TGI)
+
+Use this configuration to evaluate a model already deployed on a TGI server, for
+example one running on Hugging Face's serverless inference.
+
+__configuration file example:__
+
+```yaml
+model:
+ type: "tgi"
+ instance:
+ inference_server_address: ""
+ inference_server_auth: null
+ model_id: null # Optional, only required if the TGI container was launched with model_id pointing to a local directory
+```
diff --git a/docs/source/index.mdx b/docs/source/index.mdx
new file mode 100644
index 000000000..9c055f5e4
--- /dev/null
+++ b/docs/source/index.mdx
@@ -0,0 +1,18 @@
+# Lighteval
+
+🤗 Lighteval is your all-in-one toolkit for evaluating LLMs across multiple
+backends—whether it's
+[transformers](https://github.com/huggingface/transformers),
+[tgi](https://github.com/huggingface/text-generation-inference),
+[vllm](https://github.com/vllm-project/vllm), or
+[nanotron](https://github.com/huggingface/nanotron)—with
+ease. Dive deep into your model’s performance by saving and exploring detailed,
+sample-by-sample results to debug and see how your models stack up.
+
+Customization at your fingertips: effortlessly create [new
+tasks](adding-a-custom-task) and
+[metrics](adding-a-new-metric)
+tailored to your needs, or browse all our existing tasks and metrics.
+
+Seamlessly experiment, benchmark, and store your results on the Hugging Face
+Hub, S3, or locally.
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
new file mode 100644
index 000000000..39ac2b897
--- /dev/null
+++ b/docs/source/installation.mdx
@@ -0,0 +1,46 @@
+# Installation
+
+You can install Lighteval either from PyPI or from source.
+
+## From PyPI
+
+```bash
+pip install lighteval
+```
+
+## From source
+Installing from source is mostly for people who intend to develop `lighteval` itself.
+
+```bash
+git clone https://github.com/huggingface/lighteval.git
+cd lighteval
+pip install -e .
+```
+
+## Extras
+
+Lighteval has optional dependencies that you can install by specifying the
+appropriate extras group, e.g. `pip install lighteval[accelerate]` or
+`pip install -e .[accelerate]`.
+
+| extra name | description |
+|--------------|---------------------------------------------------------------------------|
+| accelerate | To use accelerate for model and data parallelism with transformers models |
+| tgi | To use Text Generation Inference API to evaluate your model |
+| nanotron | To evaluate nanotron models |
+| quantization | To evaluate quantized models |
+| adapters | To evaluate adapter models (delta and PEFT) |
+| tensorboardX | To upload your results to TensorBoard |
+| vllm | To use vllm as backend for inference |
+| s3 | To upload results to S3 |
+
+
+## Hugging Face login
+
+If you want to push your results to the Hugging Face Hub or evaluate your own
+private models, don't forget to add your access token to the environment
+variable `HF_TOKEN`. You can do this by running:
+
+```bash
+huggingface-cli login
+```
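+
+Alternatively, you can log in from Python (a small sketch; it assumes the `huggingface_hub` package is available in your environment, which is the case once lighteval's dependencies are installed):
+
+```python
+import os
+
+from huggingface_hub import login
+
+# Uses the token from the HF_TOKEN environment variable if it is set,
+# otherwise prompts for it interactively.
+login(token=os.environ.get("HF_TOKEN"))
+```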
diff --git a/docs/source/metric-list.mdx b/docs/source/metric-list.mdx
new file mode 100644
index 000000000..0ab03afb9
--- /dev/null
+++ b/docs/source/metric-list.mdx
@@ -0,0 +1,76 @@
+# Metric List
+
+## Automatic metrics for multiple choice tasks
+
+These metrics use log-likelihood of the different possible targets.
+- `loglikelihood_acc`: Fraction of instances where the choice with the best logprob was correct - also exists in a faster version for tasks where the possible choices include only one token (`loglikelihood_acc_single_token`)
+- `loglikelihood_acc_norm`: Fraction of instances where the choice with the best logprob, normalized by sequence length, was correct - also exists in a faster version for tasks where the possible choices include only one token (`loglikelihood_acc_norm_single_token`)
+- `loglikelihood_acc_norm_nospace`: Fraction of instances where the choice with the best logprob, normalized by sequence length, was correct, with the first space ignored
+- `loglikelihood_f1`: Corpus level F1 score of the multichoice selection - also exists in a faster version for tasks where the possible choices include only one token (`loglikelihood_f1_single_token`)
+- `mcc`: Matthews correlation coefficient (a measure of agreement between statistical distributions).
+- `recall_at_1`: Fraction of instances where the choice with the best logprob was correct - also exists in a faster version for tasks where the possible choices include only one token per choice (`recall_at_1_single_token`)
+- `recall_at_2`: Fraction of instances where the choice with the 2nd best logprob or better was correct - also exists in a faster version for tasks where the possible choices include only one token per choice (`recall_at_2_single_token`)
+- `mrr`: Mean reciprocal rank, a measure of the quality of a ranking of choices ordered by correctness/relevance - also exists in a faster version for tasks where the possible choices include only one token (`mrr_single_token`)
+- `target_perplexity`: Perplexity of the different choices available.
+- `acc_golds_likelihood`: A bit different; it checks whether the average logprob of a single target is above or below 0.5
+- `multi_f1_numeric`: Loglikelihood F1 score for multiple gold targets
+
+All these metrics also exist in a "single token" version (`loglikelihood_acc_single_token`, `loglikelihood_acc_norm_single_token`, `loglikelihood_f1_single_token`, `mcc_single_token`, `recall_at_2_single_token` and `mrr_single_token`). When the multichoice option compares only one token (e.g. "A" vs "B" vs "C" vs "D", or "yes" vs "no"), using the single token version of these metrics divides the time spent by the number of choices. Single token evals also include:
+- `multi_f1_numeric`: computes the f1 score of all possible choices and averages it.
+
+## Automatic metrics for perplexity and language modeling
+These metrics use the log-likelihood of the prompt.
+- `word_perplexity`: Perplexity (log probability of the input) weighted by the number of words of the sequence.
+- `byte_perplexity`: Perplexity (log probability of the input) weighted by the number of bytes of the sequence.
+- `bits_per_byte`: Average number of bits per byte according to model probabilities.
+- `log_prob`: Predicted output's average log probability (input's log prob for language modeling).
+
+## Automatic metrics for generative tasks
+These metrics need the model to generate an output. They are therefore slower.
+- Base:
+ - `perfect_exact_match`: Fraction of instances where the prediction matches the gold exactly.
+ - `exact_match`: Fraction of instances where the prediction matches the gold with the exception of the border whitespaces (= after a `strip` has been applied to both).
+ - `quasi_exact_match`: Fraction of instances where the normalized prediction matches the normalized gold (normalization done on whitespace, articles, capitalization, ...). Other variations exist, with other normalizers, such as `quasi_exact_match_triviaqa`, which only normalizes the predictions after applying a strip to all sentences.
+  - `prefix_exact_match`: Fraction of instances where the beginning of the prediction matches the gold with the exception of the border whitespaces (= after a `strip` has been applied to both).
+ - `prefix_quasi_exact_match`: Fraction of instances where the normalized beginning of the prediction matches the normalized gold (normalization done on whitespace, articles, capitalization, ...)
+ - `exact_match_indicator`: Exact match with some preceding context (before an indicator) removed
+ - `f1_score_quasi`: Average F1 score in terms of word overlap between the model output and gold, with both being normalized first
+ - `f1_score`: Average F1 score in terms of word overlap between the model output and gold without normalisation
+ - `f1_score_macro`: Corpus level macro F1 score
+  - `f1_score_micro`: Corpus level micro F1 score
+ - `maj_at_5` and `maj_at_8`: Model majority vote. Takes n (5 or 8) generations from the model and assumes the most frequent is the actual prediction.
+- Summarization:
+ - `rouge`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/)
+ - `rouge1`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
+ - `rouge2`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
+ - `rougeL`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
+ - `rougeLsum`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
+ - `rouge_t5` (BigBench): Corpus level ROUGE score for all available ROUGE metrics
+ - `faithfulness`: Faithfulness scores based on the SummaC method of [Laban et al. (2022)](https://aclanthology.org/2022.tacl-1.10/).
+  - `extractiveness`: Reports the following statistics, based on [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/):
+    - `summarization_coverage`: Extent to which the model-generated summaries are extractive fragments from the source document.
+    - `summarization_density`: Extent to which the model-generated summaries are extractive summaries based on the source document.
+    - `summarization_compression`: Extent to which the model-generated summaries are compressed relative to the source document.
+ - `bert_score`: Reports the average BERTScore precision, recall, and f1 score [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and gold summary.
+- Translation:
+ - `bleu`: Corpus level BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) - uses the sacrebleu implementation.
+ - `bleu_1`: Average sample BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 1-gram overlap - uses the nltk implementation.
+ - `bleu_4`: Average sample BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap - uses the nltk implementation.
+ - `chrf`: Character n-gram matches f-score.
+ - `ter`: Translation edit/error rate.
+- Copyright:
+ - `copyright`: Reports:
+ - `longest_common_prefix_length`: average length of longest common prefix between model generation and reference,
+ - `edit_distance`: average Levenshtein edit distance between model generation and reference,
+ - `edit_similarity`: average Levenshtein edit similarity (normalized by length of longer sequence) between model generation and reference.
+- Math:
+ - `quasi_exact_match_math`: Fraction of instances where the normalized prediction matches the normalized gold (normalization done for math, where latex symbols, units, etc are removed)
+ - `maj_at_4_math`: Majority choice evaluation, using the math normalisation for the predictions and gold
+ - `quasi_exact_match_gsm8k`: Fraction of instances where the normalized prediction matches the normalized gold (normalization done for gsm8k, where latex symbols, units, etc are removed)
+ - `maj_at_8_gsm8k`: Majority choice evaluation, using the gsm8k normalisation for the predictions and gold
+
+## LLM-as-Judge
+- `llm_judge_gpt3p5`: Can be used for any generative task; the model will be scored by a GPT-3.5 model using the OpenAI API.
+- `llm_judge_llama_3_405b`: Can be used for any generative task; the model will be scored by a Llama 3 405B model using the Hugging Face API.
+- `llm_judge_multi_turn_gpt3p5`: Can be used for any generative task; the model will be scored by a GPT-3.5 model using the OpenAI API. It is used for multi-turn tasks like MT-Bench.
+- `llm_judge_multi_turn_llama_3_405b`: Can be used for any generative task; the model will be scored by a Llama 3 405B model using the Hugging Face API. It is used for multi-turn tasks like MT-Bench.
diff --git a/docs/source/quicktour.mdx b/docs/source/quicktour.mdx
new file mode 100644
index 000000000..5f66547e9
--- /dev/null
+++ b/docs/source/quicktour.mdx
@@ -0,0 +1,160 @@
+# Quicktour
+
+We provide two main entry points to evaluate models:
+
+- `lighteval accelerate`: evaluate models on CPU or one or more GPUs using [🤗
+ Accelerate](https://github.com/huggingface/accelerate)
+- `lighteval nanotron`: evaluate models in distributed settings using [⚡️
+ Nanotron](https://github.com/huggingface/nanotron)
+
+## Accelerate
+
+### Evaluate a model on a GPU
+
+To evaluate `GPT-2` on the Truthful QA benchmark, run:
+
+```bash
+lighteval accelerate \
+ --model_args "pretrained=gpt2" \
+ --tasks "leaderboard|truthfulqa:mc|0|0" \
+ --override_batch_size 1 \
+ --output_dir="./evals/"
+```
+
+Here, `--tasks` accepts either a comma-separated list of supported tasks from
+the [tasks list](available-tasks), each specified in the format:
+
+```bash
+{suite}|{task}|{num_few_shot}|{0 or 1 to automatically reduce `num_few_shot` if prompt is too long}
+```
+
+or a file path like
+[examples/tasks/recommended_set.txt](https://github.com/huggingface/lighteval/blob/main/examples/tasks/recommended_set.txt)
+which specifies multiple task configurations.
+
+Tasks details can be found in the
+[file](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/default_tasks.py)
+implementing them.
+
+### Evaluate a model on one or more GPUs
+
+#### Data parallelism
+
+To evaluate a model on one or more GPUs, first create a multi-GPU config by running:
+
+```bash
+accelerate config
+```
+
+You can then evaluate a model using data parallelism on 8 GPUs as follows:
+
+```bash
+accelerate launch --multi_gpu --num_processes=8 -m \
+ lighteval accelerate \
+ --model_args "pretrained=gpt2" \
+ --tasks "leaderboard|truthfulqa:mc|0|0" \
+ --override_batch_size 1 \
+ --output_dir="./evals/"
+```
+
+Here, `--override_batch_size` defines the batch size per device, so the effective
+batch size will be `override_batch_size * num_gpus`.
+
+#### Pipeline parallelism
+
+To evaluate a model using pipeline parallelism on 2 or more GPUs, run:
+
+```bash
+lighteval accelerate \
+ --model_args "pretrained=gpt2,model_parallel=True" \
+ --tasks "leaderboard|truthfulqa:mc|0|0" \
+ --override_batch_size 1 \
+ --output_dir="./evals/"
+```
+
+This will automatically use accelerate to distribute the model across the GPUs.
+
+> [!TIP]
+> Both data and pipeline parallelism can be combined by setting
+> `model_parallel=True` and using accelerate to distribute the data across the
+GPUs.
+
+### Model Arguments
+
+The `--model_args` argument takes a comma-separated string of model arguments in
+`key=value` form, e.g. `pretrained=gpt2,dtype=float16`. The allowed arguments vary
+depending on the backend you use (vllm or accelerate).
+
+#### Accelerate
+
+- **pretrained** (str):
+ HuggingFace Hub model ID name or the path to a pre-trained
+ model to load. This is effectively the `pretrained_model_name_or_path`
+ argument of `from_pretrained` in the HuggingFace `transformers` API.
+- **tokenizer** (Optional[str]): HuggingFace Hub tokenizer ID that will be
+ used for tokenization.
+- **multichoice_continuations_start_space** (Optional[bool]): Whether to add a
+ space at the start of each continuation in multichoice generation.
+  For example, with the context "What is the capital of France?" and the choices "Paris" and "London",
+  the inputs are tokenized as "What is the capital of France? Paris" and "What is the capital of France? London".
+  `True` adds a space, `False` strips a space, and `None` does nothing.
+- **subfolder** (Optional[str]): The subfolder within the model repository.
+- **revision** (str): The revision of the model.
+- **max_gen_toks** (Optional[int]): The maximum number of tokens to generate.
+- **max_length** (Optional[int]): The maximum length of the generated output.
+- **add_special_tokens** (bool, optional, defaults to True): Whether to add special tokens to the input sequences.
+ If `None`, the default value will be set to `True` for seq2seq models (e.g. T5) and
+ `False` for causal models.
+- **model_parallel** (bool, optional, defaults to None):
+  Whether to force the use of the `accelerate` library to load a large
+  model across multiple devices.
+  Defaults to None, which compares the number of processes with the number of
+  GPUs: if the number of processes is smaller, model parallelism is used; otherwise it is not.
+- **dtype** (Union[str, torch.dtype], optional, defaults to None):
+ Converts the model weights to `dtype`, if specified. Strings get
+ converted to `torch.dtype` objects (e.g. `float16` -> `torch.float16`).
+ Use `dtype="auto"` to derive the type from the model's weights.
+- **device** (Union[int, str]): device to use for running the model.
+- **quantization_config** (Optional[BitsAndBytesConfig]): quantization
+ configuration for the model, manually provided to load a normally floating point
+ model at a quantized precision. Needed for 4-bit and 8-bit precision.
+- **trust_remote_code** (bool): Whether to trust remote code during model
+ loading.
+
+#### VLLM
+
+- **pretrained** (str): HuggingFace Hub model ID name or the path to a pre-trained model to load.
+- **gpu_memory_utilisation** (float): The fraction of GPU memory to use.
+- **batch_size** (int): The batch size used for evaluation.
+- **revision** (str): The revision of the model.
+- **dtype** (str, None): The data type to use for the model.
+- **tensor_parallel_size** (int): The number of tensor parallel units to use.
+- **data_parallel_size** (int): The number of data parallel units to use.
+- **max_model_length** (int): The maximum length of the model.
+- **swap_space** (int): The CPU swap space size (GiB) per GPU.
+- **seed** (int): The seed to use for the model.
+- **trust_remote_code** (bool): Whether to trust remote code during model loading.
+- **use_chat_template** (bool): Whether to use the chat template or not.
+- **add_special_tokens** (bool): Whether to add special tokens to the input sequences.
+- **multichoice_continuations_start_space** (bool): Whether to add a space at the start of each continuation in multichoice generation.
+- **subfolder** (Optional[str]): The subfolder within the model repository.
+
+## Nanotron
+
+To evaluate a model trained with nanotron on a single GPU:
+
+> [!WARNING]
+> Nanotron models cannot be evaluated without torchrun.
+
+
+```bash
+torchrun --standalone --nnodes=1 --nproc-per-node=1 \
+    src/lighteval/__main__.py nanotron \
+    --checkpoint_config_path ../nanotron/checkpoints/10/config.yaml \
+    --lighteval_config_path examples/nanotron/lighteval_config_override_template.yaml
+```
+
+The `nproc-per-node` argument should match the data, tensor and pipeline
+parallelism configured in the `lighteval_config_override_template.yaml` file.
+That is: `nproc-per-node = data_parallelism * tensor_parallelism *
+pipeline_parallelism`.
diff --git a/docs/source/saving-and-reading-results.mdx b/docs/source/saving-and-reading-results.mdx
new file mode 100644
index 000000000..b50cdee6c
--- /dev/null
+++ b/docs/source/saving-and-reading-results.mdx
@@ -0,0 +1,214 @@
+# Saving and reading results
+
+## Saving results locally
+
+Lighteval will automatically save results and evaluation details in the
+directory set with the `--output_dir` argument. The results will be saved in
+`{output_dir}/results/{model_name}/results_{timestamp}.json`. [Here is an
+example of a result file](#example-of-a-result-file). The output path can be
+any [fsspec](https://filesystem-spec.readthedocs.io/en/latest/index.html)
+compliant path (local, s3, hf hub, gdrive, ftp, etc).
+
+To save the details of the evaluation, you can use the `--save_details`
+argument. The details will be saved in a parquet file
+`{output_dir}/details/{model_name}/{timestamp}/details_{task}_{timestamp}.parquet`.
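+
+As a quick sanity check, you can locate and load the most recent results file with the standard library alone (a minimal sketch; the glob pattern follows the layout described above, and `output_dir` and `model_name` are placeholders to adapt):
+
+```python
+import glob
+import json
+
+output_dir = "evals_doc"
+model_name = "gpt2"
+
+# Results are saved as {output_dir}/results/{model_name}/results_{timestamp}.json
+result_files = sorted(glob.glob(f"{output_dir}/results/{model_name}/results_*.json"))
+
+with open(result_files[-1]) as f:
+    results = json.load(f)
+
+print(results["results"])  # aggregated metrics per task
+```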
+
+## Pushing results to the Hugging Face Hub
+
+You can push the results and evaluation details to the Hugging Face Hub. To do
+so, you need to set the `--push_to_hub` and `--results_org` arguments. The
+results will be saved in a dataset named `{results_org}/{model_org}/{model_name}`.
+To push the details, you also need to set the `--save_details` argument.
+The dataset created is private by default; you can make it public by setting
+the `--public_run` argument.
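+
+If you drive lighteval from Python rather than from the CLI, the same behaviour is configured on the `EvaluationTracker` (see [Using the Python API](using-the-python-api) for the full pipeline); a minimal sketch, where `hub_results_org` is assumed to play the role of `--results_org`:
+
+```python
+from lighteval.logging.evaluation_tracker import EvaluationTracker
+
+evaluation_tracker = EvaluationTracker(
+    output_dir="./results",
+    save_details=True,   # also save (and push) the sample-by-sample details
+    push_to_hub=True,    # push results to the Hugging Face Hub
+    hub_results_org="your_org",  # organization (or username) to push results to
+)
+```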
+
+
+## Pushing results to TensorBoard
+
+You can push the results to TensorBoard by setting `--push_to_tensorboard`.
+
+
+## How to load and investigate details
+
+### Load from local detail files
+
+```python
+import glob
+
+from datasets import load_dataset
+
+output_dir = "evals_doc"
+model_name = "HuggingFaceH4/zephyr-7b-beta"
+timestamp = "latest"
+task = "lighteval|gsm8k|0"
+
+if timestamp == "latest":
+ path = f"{output_dir}/details/{model_org}/{model_name}/*/"
+ timestamps = glob.glob(path)
+ timestamp = sorted(timestamps)[-1].split("/")[-2]
+ print(f"Latest timestamp: {timestamp}")
+
+details_path = f"{output_dir}/details/{model_name}/{timestamp}/details_{task}_{timestamp}.parquet"
+
+# Load the details
+details = load_dataset("parquet", data_files=details_path, split="train")
+
+for detail in details:
+ print(detail)
+```
+
+### Load from the HuggingFace hub
+
+```python
+from datasets import load_dataset
+
+results_org = "SaylorTwift"
+model_name = "HuggingFaceH4/zephyr-7b-beta"
+sanitized_model_name = model_name.replace("/", "__")
+task = "lighteval|gsm8k|0"
+public_run = False
+
+dataset_path = f"{results_org}/details_{sanitized_model_name}{'_private' if not public_run else ''}"
+details = load_dataset(dataset_path, task.replace("|", "_"), split="latest")
+
+for detail in details:
+ print(detail)
+```
+
+
+The detail file contains the following columns (see the inspection example after the list):
+- `choices`: The choices presented to the model in the case of multichoice tasks.
+- `gold`: The gold answer.
+- `gold_index`: The index of the gold answer in the choices list.
+- `cont_tokens`: The continuation tokens.
+- `example`: The input in text form.
+- `full_prompt`: The full prompt that is fed to the model.
+- `input_tokens`: The tokens of the full prompt.
+- `instruction`: The instruction given to the model.
+- `metrics`: The metrics computed for the example.
+- `num_asked_few_shots`: The number of few shots asked to the model.
+- `num_effective_few_shots`: The number of effective few shots.
+- `padded`: Whether the input was padded.
+- `pred_logits`: The logits of the model.
+- `predictions`: The predictions of the model.
+- `specifics`: The specifics of the task.
+- `truncated`: Whether the input was truncated.
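+
+Once loaded with one of the snippets above, the details can be inspected column by column, for example by converting them to a pandas DataFrame (this assumes `pandas` is installed and reuses the `details` dataset from the previous examples):
+
+```python
+# Convert the details dataset to a pandas DataFrame for easier inspection
+df = details.to_pandas()
+
+# Look at what the model saw and produced for the first few samples
+print(df[["full_prompt", "predictions", "metrics"]].head())
+```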
+
+
+## Example of a result file
+
+```json
+{
+ "config_general": {
+ "lighteval_sha": "203045a8431bc9b77245c9998e05fc54509ea07f",
+ "num_fewshot_seeds": 1,
+ "override_batch_size": 1,
+ "max_samples": 1,
+ "job_id": "",
+ "start_time": 620979.879320166,
+ "end_time": 621004.632108041,
+ "total_evaluation_time_secondes": "24.752787875011563",
+ "model_name": "gpt2",
+ "model_sha": "607a30d783dfa663caf39e06633721c8d4cfcd7e",
+ "model_dtype": null,
+ "model_size": "476.2 MB"
+ },
+ "results": {
+ "lighteval|gsm8k|0": {
+ "qem": 0.0,
+ "qem_stderr": 0.0,
+ "maj@8": 0.0,
+ "maj@8_stderr": 0.0
+ },
+ "all": {
+ "qem": 0.0,
+ "qem_stderr": 0.0,
+ "maj@8": 0.0,
+ "maj@8_stderr": 0.0
+ }
+ },
+ "versions": {
+ "lighteval|gsm8k|0": 0
+ },
+ "config_tasks": {
+ "lighteval|gsm8k": {
+ "name": "gsm8k",
+ "prompt_function": "gsm8k",
+ "hf_repo": "gsm8k",
+ "hf_subset": "main",
+ "metric": [
+ {
+ "metric_name": "qem",
+ "higher_is_better": true,
+ "category": "3",
+ "use_case": "5",
+ "sample_level_fn": "compute",
+ "corpus_level_fn": "mean"
+ },
+ {
+ "metric_name": "maj@8",
+ "higher_is_better": true,
+ "category": "5",
+ "use_case": "5",
+ "sample_level_fn": "compute",
+ "corpus_level_fn": "mean"
+ }
+ ],
+ "hf_avail_splits": [
+ "train",
+ "test"
+ ],
+ "evaluation_splits": [
+ "test"
+ ],
+ "few_shots_split": null,
+ "few_shots_select": "random_sampling_from_train",
+ "generation_size": 256,
+ "generation_grammar": null,
+ "stop_sequence": [
+ "Question="
+ ],
+ "output_regex": null,
+ "num_samples": null,
+ "frozen": false,
+ "suite": [
+ "lighteval"
+ ],
+ "original_num_docs": 1319,
+ "effective_num_docs": 1,
+ "trust_dataset": true,
+ "must_remove_duplicate_docs": null,
+ "version": 0
+ }
+ },
+ "summary_tasks": {
+ "lighteval|gsm8k|0": {
+ "hashes": {
+ "hash_examples": "8517d5bf7e880086",
+ "hash_full_prompts": "8517d5bf7e880086",
+ "hash_input_tokens": "29916e7afe5cb51d",
+ "hash_cont_tokens": "37f91ce23ef6d435"
+ },
+ "truncated": 2,
+ "non_truncated": 0,
+ "padded": 0,
+ "non_padded": 2,
+ "effective_few_shots": 0.0,
+ "num_truncated_few_shots": 0
+ }
+ },
+ "summary_general": {
+ "hashes": {
+ "hash_examples": "5f383c395f01096e",
+ "hash_full_prompts": "5f383c395f01096e",
+ "hash_input_tokens": "ac933feb14f96d7b",
+ "hash_cont_tokens": "9d03fb26f8da7277"
+ },
+ "truncated": 2,
+ "non_truncated": 0,
+ "padded": 0,
+ "non_padded": 2,
+ "num_truncated_few_shots": 0
+ }
+}
+```
diff --git a/docs/source/use-vllm-as-backend.mdx b/docs/source/use-vllm-as-backend.mdx
new file mode 100644
index 000000000..153ff659f
--- /dev/null
+++ b/docs/source/use-vllm-as-backend.mdx
@@ -0,0 +1,53 @@
+# Use VLLM as backend
+
+Lighteval allows you to use `vllm` as a backend, which can provide significant speedups.
+To use it, simply change the `model_args` to reflect the arguments you want to pass to vllm.
+
+```bash
+lighteval accelerate \
+ --model_args="vllm,pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16" \
+ --tasks "leaderboard|truthfulqa:mc|0|0" \
+ --output_dir="./evals/"
+```
+
+`vllm` is able to distribute the model across multiple GPUs using data
+parallelism, pipeline parallelism or tensor parallelism.
+You can choose the parallelism method by setting it in the `model_args`.
+
+For example, if you have 4 GPUs, you can split the model across them using tensor parallelism (`tensor_parallel_size`):
+
+```bash
+export VLLM_WORKER_MULTIPROC_METHOD=spawn && lighteval accelerate \
+ --model_args="vllm,pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16,tensor_parallel_size=4" \
+ --tasks "leaderboard|truthfulqa:mc|0|0" \
+ --output_dir="./evals/"
+```
+
+Or, if your model fits on a single GPU, you can use data parallelism (`data_parallel_size`) to speed up the evaluation:
+
+```bash
+lighteval accelerate \
+ --model_args="vllm,pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16,data_parallel_size=4" \
+ --tasks "leaderboard|truthfulqa:mc|0|0" \
+ --output_dir="./evals/"
+```
+
+Available arguments for `vllm` can be found in the `VLLMModelConfig`:
+
+- **pretrained** (str): HuggingFace Hub model ID name or the path to a pre-trained model to load.
+- **gpu_memory_utilisation** (float): The fraction of GPU memory to use.
+- **revision** (str): The revision of the model.
+- **dtype** (str, None): The data type to use for the model.
+- **tensor_parallel_size** (int): The number of tensor parallel units to use.
+- **data_parallel_size** (int): The number of data parallel units to use.
+- **max_model_length** (int): The maximum length of the model.
+- **swap_space** (int): The CPU swap space size (GiB) per GPU.
+- **seed** (int): The seed to use for the model.
+- **trust_remote_code** (bool): Whether to trust remote code during model loading.
+- **add_special_tokens** (bool): Whether to add special tokens to the input sequences.
+- **multichoice_continuations_start_space** (bool): Whether to add a space at the start of each continuation in multichoice generation.
+
+> [!WARNING]
+> In the case of OOM issues, you might need to reduce the context size of the
+> model as well as reduce the `gpu_memory_utilisation` parameter.
+
diff --git a/docs/source/using-the-python-api.mdx b/docs/source/using-the-python-api.mdx
new file mode 100644
index 000000000..82238c7f1
--- /dev/null
+++ b/docs/source/using-the-python-api.mdx
@@ -0,0 +1,62 @@
+# Using the Python API
+
+Lighteval can be used from a custom Python script. To evaluate a model, you will
+need to set up an `evaluation_tracker`, `pipeline_parameters`, a `model_config`,
+and a `pipeline`.
+
+After that, simply run the pipeline and save the results.
+
+
+```python
+from datetime import timedelta
+
+import lighteval
+from lighteval.logging.evaluation_tracker import EvaluationTracker
+from lighteval.models.model_config import VLLMModelConfig
+from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters
+from lighteval.utils.utils import EnvConfig
+from lighteval.utils.imports import is_accelerate_available
+
+if is_accelerate_available():
+ from accelerate import Accelerator, InitProcessGroupKwargs
+ accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=3000))])
+else:
+ accelerator = None
+
+def main():
+ evaluation_tracker = EvaluationTracker(
+ output_dir="./results",
+ save_details=True,
+ push_to_hub=True,
+        hub_results_org="your user name",  # your Hugging Face username or org
+ )
+
+ pipeline_params = PipelineParameters(
+ launcher_type=ParallelismManager.ACCELERATE,
+ env_config=EnvConfig(cache_dir="tmp/"),
+ # Remove the 2 parameters below once your configuration is tested
+ override_batch_size=1,
+ max_samples=10
+ )
+
+ model_config = VLLMModelConfig(
+ pretrained="HuggingFaceH4/zephyr-7b-beta",
+ dtype="float16",
+ use_chat_template=True,
+ )
+
+ task = "helm|mmlu|5|1"
+
+ pipeline = Pipeline(
+ tasks=task,
+ pipeline_parameters=pipeline_params,
+ evaluation_tracker=evaluation_tracker,
+ model_config=model_config,
+ custom_task_directory=None, # if using a custom task
+ )
+
+ pipeline.evaluate()
+ pipeline.save_and_push_results()
+ pipeline.show_results()
+
+if __name__ == "__main__":
+ main()
+```
diff --git a/pyproject.toml b/pyproject.toml
index e736be66a..a779ebf4c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -92,6 +92,7 @@ vllm = ["vllm", "ray", "more_itertools"]
quality = ["ruff==v0.2.2","pre-commit"]
tests = ["pytest==7.4.0"]
dev = ["lighteval[accelerate,quality,tests,multilingual]"]
+docs = ["hf-doc-builder", "watchdog"]
extended_tasks = [
"langdetect", # ifeval
"openai", # llm as a judge using openai models