From 0c808016f2937360e5c4e11f38e4c0372e704861 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
<8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 28 Nov 2024 20:29:54 +0100
Subject: [PATCH] Set up docs (#403)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* Add docs
* Add wiki to docs
* Adapt wiki as docs
* Force docs build
* Fix link in _toctree
* Add titles to docs pages
* Update docs/source/evaluate-the-model-on-a-server-or-container.mdx
Co-authored-by: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
---------
Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>
Co-authored-by: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
---
.github/workflows/doc-build.yml | 18 +
.github/workflows/doc-pr-build.yml | 16 +
docs/source/_toctree.yml | 30 +
docs/source/adding-a-custom-task.mdx | 196 +++
docs/source/adding-a-new-metric.mdx | 93 ++
docs/source/available-tasks.mdx | 1250 +++++++++++++++++
...ntributing-to-multilingual-evaluations.mdx | 107 ++
...ate-the-model-on-a-server-or-container.mdx | 67 +
docs/source/index.mdx | 18 +
docs/source/installation.mdx | 46 +
docs/source/metric-list.mdx | 76 +
docs/source/quicktour.mdx | 160 +++
docs/source/saving-and-reading-results.mdx | 214 +++
docs/source/use-vllm-as-backend.mdx | 53 +
docs/source/using-the-python-api.mdx | 62 +
pyproject.toml | 1 +
16 files changed, 2407 insertions(+)
create mode 100644 .github/workflows/doc-build.yml
create mode 100644 .github/workflows/doc-pr-build.yml
create mode 100644 docs/source/_toctree.yml
create mode 100644 docs/source/adding-a-custom-task.mdx
create mode 100644 docs/source/adding-a-new-metric.mdx
create mode 100644 docs/source/available-tasks.mdx
create mode 100644 docs/source/contributing-to-multilingual-evaluations.mdx
create mode 100644 docs/source/evaluate-the-model-on-a-server-or-container.mdx
create mode 100644 docs/source/index.mdx
create mode 100644 docs/source/installation.mdx
create mode 100644 docs/source/metric-list.mdx
create mode 100644 docs/source/quicktour.mdx
create mode 100644 docs/source/saving-and-reading-results.mdx
create mode 100644 docs/source/use-vllm-as-backend.mdx
create mode 100644 docs/source/using-the-python-api.mdx
diff --git a/.github/workflows/doc-build.yml b/.github/workflows/doc-build.yml
new file mode 100644
index 000000000..cd345d3d3
--- /dev/null
+++ b/.github/workflows/doc-build.yml
@@ -0,0 +1,18 @@
+name: Build Documentation
+
+on:
+ push:
+ branches:
+ - main
+ - doc-builder*
+ - v*-release
+
+jobs:
+ build:
+ uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
+ with:
+ commit_sha: ${{ github.sha }}
+ package: lighteval
+ secrets:
+ token: ${{ secrets.HUGGINGFACE_PUSH }}
+ hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
diff --git a/.github/workflows/doc-pr-build.yml b/.github/workflows/doc-pr-build.yml
new file mode 100644
index 000000000..f96e20583
--- /dev/null
+++ b/.github/workflows/doc-pr-build.yml
@@ -0,0 +1,16 @@
+name: Build PR Documentation
+
+on:
+ pull_request:
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+ cancel-in-progress: true
+
+jobs:
+ build:
+ uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
+ with:
+ commit_sha: ${{ github.event.pull_request.head.sha }}
+ pr_number: ${{ github.event.number }}
+ package: lighteval
diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
new file mode 100644
index 000000000..243462b3d
--- /dev/null
+++ b/docs/source/_toctree.yml
@@ -0,0 +1,30 @@
+- sections:
+ - local: index
+ title: 🤗 Lighteval
+ - local: installation
+ title: Installation
+ - local: quicktour
+ title: Quicktour
+ title: Getting started
+- sections:
+ - local: saving-and-reading-results
+ title: Save and read results
+ - local: using-the-python-api
+ title: Use the Python API
+ - local: adding-a-custom-task
+ title: Add a custom task
+ - local: adding-a-new-metric
+ title: Add a custom metric
+ - local: use-vllm-as-backend
+ title: Use VLLM as backend
+ - local: evaluate-the-model-on-a-server-or-container
+ title: Evaluate on Server
+ - local: contributing-to-multilingual-evaluations
+ title: Contributing to multilingual evaluations
+ title: Guides
+- sections:
+ - local: metric-list
+ title: Available Metrics
+ - local: available-tasks
+ title: Available Tasks
+ title: API
diff --git a/docs/source/adding-a-custom-task.mdx b/docs/source/adding-a-custom-task.mdx
new file mode 100644
index 000000000..bcaa932ff
--- /dev/null
+++ b/docs/source/adding-a-custom-task.mdx
@@ -0,0 +1,196 @@
+# Adding a Custom Task
+
+To add a new task, first open an issue to determine whether it should be
+integrated in the core evaluations of lighteval, in the extended tasks, or in
+the community tasks, and add its dataset on the Hub.
+
+- Core evaluations only require standard logic in their metrics and
+  processing, and we add them to our test suite to ensure non-regression over
+  time. They already see high usage in the community.
+- Extended evaluations require custom logic in their metrics (complex
+  normalisation, an LLM as a judge, ...); we added them to make users' lives
+  easier. They already see high usage in the community.
+- Community evaluations are new tasks submitted by the community.
+
+A popular community evaluation can move to become an extended or core evaluation over time.
+
+> [!TIP]
+> You can find examples of custom tasks in the `community_tasks` directory.
+
+## Step by step creation of a custom task
+
+> [!WARNING]
+> To contribute your custom task to the lighteval repo, you first need
+> to install the required dev dependencies by running `pip install -e .[dev]`
+> and then run `pre-commit install` to install the pre-commit hooks.
+
+First, create a python file under the `community_tasks` directory.
+
+You need to define a prompt function that will convert a line from your
+dataset to a document to be used for evaluation.
+
+```python
+# Define as many as you need for your different tasks
+def prompt_fn(line, task_name: str = None):
+ """Defines how to go from a dataset line to a doc object.
+ Follow examples in src/lighteval/tasks/default_prompts.py, or get more info
+ about what this function should do in the README.
+ """
+ return Doc(
+ task_name=task_name,
+ query=line["question"],
+ choices=[f" {c}" for c in line["choices"]],
+ gold_index=line["gold"],
+ instruction="",
+ )
+```
+
+Then, you need to choose a metric: you can either use an existing one (defined
+in `lighteval/metrics/metrics.py`) or [create a custom one](adding-a-new-metric).
+
+```python
+custom_metric = SampleLevelMetric(
+ metric_name="my_custom_metric_name",
+ higher_is_better=True,
+ category=MetricCategory.IGNORED,
+ use_case=MetricUseCase.NONE,
+ sample_level_fn=lambda x: x, # how to compute score for one sample
+    corpus_level_fn=np.mean, # How to aggregate the sample-level metrics
+)
+```
+
+Then, you need to define your task. You can define a task with or without subsets.
+To define a task with no subsets:
+
+```python
+# This is how you create a simple task (like hellaswag) which has one single subset
+# attached to it, and one evaluation possible.
+task = LightevalTaskConfig(
+ name="myothertask",
+    prompt_function=prompt_fn, # must be defined in the file or imported from src/lighteval/tasks/default_prompts.py
+ suite=["community"],
+ hf_repo="",
+ hf_subset="default",
+ hf_avail_splits=[],
+ evaluation_splits=[],
+ few_shots_split=None,
+ few_shots_select=None,
+ metric=[], # select your metric in Metrics
+)
+```
+
+If you want to create a task with multiple subsets, add them to the
+`SAMPLE_SUBSETS` list and create a task for each subset.
+
+```python
+SAMPLE_SUBSETS = [] # list of all the subsets to use for this eval
+
+
+class CustomSubsetTask(LightevalTaskConfig):
+ def __init__(
+ self,
+ name,
+ hf_subset,
+ ):
+ super().__init__(
+ name=name,
+ hf_subset=hf_subset,
+            prompt_function=prompt_fn, # must be defined in the file or imported from src/lighteval/tasks/default_prompts.py
+ hf_repo="",
+ metric=[custom_metric], # select your metric in Metrics or use your custom_metric
+ hf_avail_splits=[],
+ evaluation_splits=[],
+ few_shots_split=None,
+ few_shots_select=None,
+ suite=["community"],
+ generation_size=-1,
+ stop_sequence=None,
+ output_regex=None,
+ frozen=False,
+ )
+
+
+SUBSET_TASKS = [CustomSubsetTask(name=f"mytask:{subset}", hf_subset=subset) for subset in SAMPLE_SUBSETS]
+```
+
+Here is a list of the parameters and their meaning (a filled-in example follows the list):
+
+- `name` (str), your evaluation name
+- `suite` (list), the suite(s) to which your evaluation should belong. This
+ field allows us to compare different task implementations and is used as a
+ task selection to differentiate the versions to launch. At the moment, you'll
+ find the keywords ["helm", "bigbench", "original", "lighteval", "community",
+ "custom"]; for core evals, please choose `lighteval`.
+- `prompt_function` (Callable), the prompt function you defined in the step
+ above
+- `hf_repo` (str), the path to your evaluation dataset on the hub
+- `hf_subset` (str), the specific subset you want to use for your evaluation
+ (note: when the dataset has no subset, fill this field with `"default"`, not
+ with `None` or `""`)
+- `hf_avail_splits` (list), all the splits available for your dataset (train,
+ valid or validation, test, other...)
+- `evaluation_splits` (list), the splits you want to use for evaluation
+- `few_shots_split` (str, can be `null`), the specific split from which you
+ want to select samples for your few-shot examples. It should be different
+ from the sets included in `evaluation_splits`
+- `few_shots_select` (str, can be `null`), the method that you will use to
+ select items for your few-shot examples. Can be `null`, or one of:
+ - `balanced` select examples from the `few_shots_split` with balanced
+ labels, to avoid skewing the few shot examples (hence the model
+ generations) toward one specific label
+ - `random` selects examples at random from the `few_shots_split`
+ - `random_sampling` selects new examples at random from the
+ `few_shots_split` for every new item, but if a sampled item is equal to
+ the current one, it is removed from the available samples
+ - `random_sampling_from_train` selects new examples at random from the
+ `few_shots_split` for every new item, but if a sampled item is equal to
+ the current one, it is kept! Only use this if you know what you are
+ doing.
+ - `sequential` selects the first `n` examples of the `few_shots_split`
+- `generation_size` (int), the maximum number of tokens allowed for a
+ generative evaluation. If your evaluation is a log likelihood evaluation
+ (multi-choice), this value should be -1
+- `stop_sequence` (list), a list of strings acting as end of sentence tokens
+ for your generation
+- `metric` (list), the metrics you want to use for your evaluation (see next
+ section for a detailed explanation)
+- `output_regex` (str), A regex string that will be used to filter your
+ generation. (Generative metrics will only select tokens that are between the
+ first and the second sequence matched by the regex. For example, for a regex
+ matching `\n` and a generation `\nModel generation output\nSome other text`
+ the metric will only be fed with `Model generation output`)
+- `frozen` (bool), currently set to False, but we will progressively set all
+ stable tasks to True.
+- `trust_dataset` (bool), set to True if you trust the dataset.
+
+
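+For illustration, here is how these parameters could be combined for a
+hypothetical multiple-choice task (the dataset path and splits below are
+placeholders, not a real configuration):
+
+```python
+my_task = LightevalTaskConfig(
+    name="mycommunitytask",
+    suite=["community"],
+    prompt_function=prompt_fn,  # the prompt function defined above
+    hf_repo="your-username/your-dataset",  # placeholder dataset path on the Hub
+    hf_subset="default",  # use "default" when the dataset has no subset
+    hf_avail_splits=["train", "test"],
+    evaluation_splits=["test"],
+    few_shots_split="train",
+    few_shots_select="balanced",
+    generation_size=-1,  # -1 because this is a log likelihood (multi-choice) evaluation
+    stop_sequence=["\n"],
+    metric=[custom_metric],  # the custom metric defined above, or one from Metrics
+    trust_dataset=True,
+)
+```
+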
+Then you need to add your task to the `TASKS_TABLE` list.
+
+```python
+# STORE YOUR EVALS
+
+# tasks with subset:
+TASKS_TABLE = SUBSET_TASKS
+
+# tasks without subset:
+# TASKS_TABLE = [task]
+```
+
+Finally, add the following module logic at the end of the file; it is only
+used when the file is run directly, and prints the defined tasks as a quick
+sanity check.
+
+```python
+# MODULE LOGIC
+# You should not need to touch this
+# Print the tasks defined in this file when it is run directly (sanity check)
+if __name__ == "__main__":
+    print([t.name for t in TASKS_TABLE])
+ print(len(TASKS_TABLE))
+```
+
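+For example, assuming your file is saved as `community_tasks/my_new_task.py`
+(a placeholder name), you can check that it loads correctly with:
+
+```bash
+python community_tasks/my_new_task.py
+```
+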
+Once your file is created, you can run the evaluation with the following command:
+
+```bash
+lighteval accelerate \
+ --model_args "pretrained=HuggingFaceH4/zephyr-7b-beta" \
+ --tasks "community|{custom_task}|{fewshots}|{truncate_few_shot}" \
+ --custom_tasks {path_to_your_custom_task_file} \
+ --output_dir "./evals"
+```
diff --git a/docs/source/adding-a-new-metric.mdx b/docs/source/adding-a-new-metric.mdx
new file mode 100644
index 000000000..e8562af4f
--- /dev/null
+++ b/docs/source/adding-a-new-metric.mdx
@@ -0,0 +1,93 @@
+# Adding a New Metric
+
+First, check if you can use one of the parametrized functions in
+`src/lighteval/metrics/metrics_corpus.py` or
+`src/lighteval/metrics/metrics_sample.py`.
+
+If not, you can use the `custom_task` system to register your new metric:
+
+> [!TIP]
+> To see an example of a custom metric added along with a custom task, look at the IFEval custom task.
+
+
+> [!WARNING]
+> To contribute your custom metric to the lighteval repo, you would first need
+> to install the required dev dependencies by running `pip install -e .[dev]`
+> and then run `pre-commit install` to install the pre-commit hooks.
+
+
+- Create a new Python file which should contain the full logic of your metric.
+- The file also needs to start with these imports:
+
+```python
+from aenum import extend_enum
+from lighteval.metrics import Metrics
+```
+
+You need to define a sample level metric:
+
+```python
+def custom_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> bool:
+ response = predictions[0]
+ return response == formatted_doc.choices[formatted_doc.gold_index]
+```
+
+Here, the sample-level metric returns only one value per sample. If you want to return multiple metrics per sample, return a dictionary with the metric names as keys and the metric values as values.
+
+```python
+def custom_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> dict:
+ response = predictions[0]
+ return {"accuracy": response == formatted_doc.choices[formatted_doc.gold_index], "other_metric": 0.5}
+```
+
+Then, you can define an aggregation function if needed; a common aggregation function is `np.mean`.
+
+```python
+def agg_function(items):
+ flat_items = [item for sublist in items for item in sublist]
+ score = sum(flat_items) / len(flat_items)
+ return score
+```
+
+Finally, you can define your metric. If it's a sample level metric, you can use the following code:
+
+```python
+my_custom_metric = SampleLevelMetric(
+ metric_name={custom_metric_name},
+ higher_is_better={either True or False},
+ category={MetricCategory},
+ use_case={MetricUseCase},
+ sample_level_fn=custom_metric,
+ corpus_level_fn=agg_function,
+)
+```
+
+If your metric defines multiple metrics per sample, you can use the following code:
+
+```python
+custom_metric = SampleLevelMetricGrouping(
+ metric_name={submetric_names},
+ higher_is_better={n: {True or False} for n in submetric_names},
+ category={MetricCategory},
+ use_case={MetricUseCase},
+ sample_level_fn=custom_metric,
+ corpus_level_fn={
+ "accuracy": np.mean,
+ "other_metric": agg_function,
+ },
+)
+```
+
+To finish, add the following so that your metric is added to our metrics list
+when the file is loaded as a module.
+
+```python
+# Adds the metric to the metric list!
+extend_enum(Metrics, "my_custom_metric", my_custom_metric)
+if __name__ == "__main__":
+ print("Imported metric")
+```
+
+You can then give your custom metric to lighteval by using `--custom-tasks
+path_to_your_file` when launching it.
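+
+For example, reusing the launch command from the [custom task guide](adding-a-custom-task)
+(the model, task specification and file path are placeholders):
+
+```bash
+lighteval accelerate \
+    --model_args "pretrained=HuggingFaceH4/zephyr-7b-beta" \
+    --tasks "community|{custom_task}|{fewshots}|{truncate_few_shot}" \
+    --custom_tasks {path_to_your_custom_task_file} \
+    --output_dir "./evals"
+```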
+
diff --git a/docs/source/available-tasks.mdx b/docs/source/available-tasks.mdx
new file mode 100644
index 000000000..9b167d21e
--- /dev/null
+++ b/docs/source/available-tasks.mdx
@@ -0,0 +1,1250 @@
+# Available Tasks
+
+You can get a list of all the available tasks by running:
+
+```bash
+lighteval tasks --list
+```
+
+## List of tasks
+
+- bigbench:
+ - bigbench|abstract_narrative_understanding
+ - bigbench|anachronisms
+ - bigbench|analogical_similarity
+ - bigbench|analytic_entailment
+ - bigbench|arithmetic_bb
+ - bigbench|ascii_word_recognition
+ - bigbench|authorship_verification
+ - bigbench|auto_categorization
+ - bigbench|auto_debugging
+ - bigbench|bbq_lite_json
+ - bigbench|bridging_anaphora_resolution_barqa
+ - bigbench|causal_judgment
+ - bigbench|cause_and_effect
+ - bigbench|checkmate_in_one
+ - bigbench|chess_state_tracking
+ - bigbench|chinese_remainder_theorem
+ - bigbench|cifar10_classification
+ - bigbench|code_line_description
+ - bigbench|codenames
+ - bigbench|color
+ - bigbench|common_morpheme
+ - bigbench|conceptual_combinations
+ - bigbench|conlang_translation
+ - bigbench|contextual_parametric_knowledge_conflicts
+ - bigbench|coqa_bb
+ - bigbench|crash_blossom
+ - bigbench|crass_ai
+ - bigbench|cryobiology_spanish
+ - bigbench|cryptonite
+ - bigbench|cs_algorithms
+ - bigbench|dark_humor_detection
+ - bigbench|date_understanding
+ - bigbench|disambiguation_qa
+ - bigbench|discourse_marker_prediction
+ - bigbench|disfl_qa
+ - bigbench|dyck_languages
+ - bigbench|elementary_math_qa
+ - bigbench|emoji_movie
+ - bigbench|emojis_emotion_prediction
+ - bigbench|empirical_judgments
+ - bigbench|english_proverbs
+ - bigbench|english_russian_proverbs
+ - bigbench|entailed_polarity
+ - bigbench|entailed_polarity_hindi
+ - bigbench|epistemic_reasoning
+ - bigbench|evaluating_information_essentiality
+ - bigbench|fact_checker
+ - bigbench|fantasy_reasoning
+ - bigbench|few_shot_nlg
+ - bigbench|figure_of_speech_detection
+ - bigbench|formal_fallacies_syllogisms_negation
+ - bigbench|gem
+ - bigbench|gender_inclusive_sentences_german
+ - bigbench|general_knowledge
+ - bigbench|geometric_shapes
+ - bigbench|goal_step_wikihow
+ - bigbench|gre_reading_comprehension
+ - bigbench|hhh_alignment
+ - bigbench|hindi_question_answering
+ - bigbench|hindu_knowledge
+ - bigbench|hinglish_toxicity
+ - bigbench|human_organs_senses
+ - bigbench|hyperbaton
+ - bigbench|identify_math_theorems
+ - bigbench|identify_odd_metaphor
+ - bigbench|implicatures
+ - bigbench|implicit_relations
+ - bigbench|intent_recognition
+ - bigbench|international_phonetic_alphabet_nli
+ - bigbench|international_phonetic_alphabet_transliterate
+ - bigbench|intersect_geometry
+ - bigbench|irony_identification
+ - bigbench|kanji_ascii
+ - bigbench|kannada
+ - bigbench|key_value_maps
+ - bigbench|known_unknowns
+ - bigbench|language_games
+ - bigbench|language_identification
+ - bigbench|linguistic_mappings
+ - bigbench|linguistics_puzzles
+ - bigbench|logic_grid_puzzle
+ - bigbench|logical_args
+ - bigbench|logical_deduction
+ - bigbench|logical_fallacy_detection
+ - bigbench|logical_sequence
+ - bigbench|mathematical_induction
+ - bigbench|matrixshapes
+ - bigbench|metaphor_boolean
+ - bigbench|metaphor_understanding
+ - bigbench|minute_mysteries_qa
+ - bigbench|misconceptions
+ - bigbench|misconceptions_russian
+ - bigbench|mnist_ascii
+ - bigbench|modified_arithmetic
+ - bigbench|moral_permissibility
+ - bigbench|movie_dialog_same_or_different
+ - bigbench|movie_recommendation
+ - bigbench|mult_data_wrangling
+ - bigbench|multiemo
+ - bigbench|natural_instructions
+ - bigbench|navigate
+ - bigbench|nonsense_words_grammar
+ - bigbench|novel_concepts
+ - bigbench|object_counting
+ - bigbench|odd_one_out
+ - bigbench|operators
+ - bigbench|paragraph_segmentation
+ - bigbench|parsinlu_qa
+ - bigbench|parsinlu_reading_comprehension
+ - bigbench|penguins_in_a_table
+ - bigbench|periodic_elements
+ - bigbench|persian_idioms
+ - bigbench|phrase_relatedness
+ - bigbench|physical_intuition
+ - bigbench|physics
+ - bigbench|physics_questions
+ - bigbench|play_dialog_same_or_different
+ - bigbench|polish_sequence_labeling
+ - bigbench|presuppositions_as_nli
+ - bigbench|qa_wikidata
+ - bigbench|question_selection
+ - bigbench|real_or_fake_text
+ - bigbench|reasoning_about_colored_objects
+ - bigbench|repeat_copy_logic
+ - bigbench|rephrase
+ - bigbench|rhyming
+ - bigbench|riddle_sense
+ - bigbench|ruin_names
+ - bigbench|salient_translation_error_detection
+ - bigbench|scientific_press_release
+ - bigbench|semantic_parsing_in_context_sparc
+ - bigbench|semantic_parsing_spider
+ - bigbench|sentence_ambiguity
+ - bigbench|similarities_abstraction
+ - bigbench|simp_turing_concept
+ - bigbench|simple_arithmetic_json
+ - bigbench|simple_arithmetic_json_multiple_choice
+ - bigbench|simple_arithmetic_json_subtasks
+ - bigbench|simple_arithmetic_multiple_targets_json
+ - bigbench|simple_ethical_questions
+ - bigbench|simple_text_editing
+ - bigbench|snarks
+ - bigbench|social_iqa
+ - bigbench|social_support
+ - bigbench|sports_understanding
+ - bigbench|strange_stories
+ - bigbench|strategyqa
+ - bigbench|sufficient_information
+ - bigbench|suicide_risk
+ - bigbench|swahili_english_proverbs
+ - bigbench|swedish_to_german_proverbs
+ - bigbench|symbol_interpretation
+ - bigbench|tellmewhy
+ - bigbench|temporal_sequences
+ - bigbench|tense
+ - bigbench|timedial
+ - bigbench|topical_chat
+ - bigbench|tracking_shuffled_objects
+ - bigbench|understanding_fables
+ - bigbench|undo_permutation
+ - bigbench|unit_conversion
+ - bigbench|unit_interpretation
+ - bigbench|unnatural_in_context_learning
+ - bigbench|vitaminc_fact_verification
+ - bigbench|what_is_the_tao
+ - bigbench|which_wiki_edit
+ - bigbench|wino_x_german
+ - bigbench|winowhy
+ - bigbench|word_sorting
+ - bigbench|word_unscrambling
+
+- harness:
+ - harness|bbh:boolean_expressions
+ - harness|bbh:causal_judgment
+ - harness|bbh:date_understanding
+ - harness|bbh:disambiguation_qa
+ - harness|bbh:dyck_languages
+ - harness|bbh:formal_fallacies
+ - harness|bbh:geometric_shapes
+ - harness|bbh:hyperbaton
+ - harness|bbh:logical_deduction_five_objects
+ - harness|bbh:logical_deduction_seven_objects
+ - harness|bbh:logical_deduction_three_objects
+ - harness|bbh:movie_recommendation
+ - harness|bbh:multistep_arithmetic_two
+ - harness|bbh:navigate
+ - harness|bbh:object_counting
+ - harness|bbh:penguins_in_a_table
+ - harness|bbh:reasoning_about_colored_objects
+ - harness|bbh:ruin_names
+ - harness|bbh:salient_translation_error_detection
+ - harness|bbh:snarks
+ - harness|bbh:sports_understanding
+ - harness|bbh:temporal_sequences
+ - harness|bbh:tracking_shuffled_objects_five_objects
+ - harness|bbh:tracking_shuffled_objects_seven_objects
+ - harness|bbh:tracking_shuffled_objects_three_objects
+ - harness|bbh:web_of_lies
+ - harness|bbh:word_sorting
+ - harness|bigbench:causal_judgment
+ - harness|bigbench:date_understanding
+ - harness|bigbench:disambiguation_qa
+ - harness|bigbench:geometric_shapes
+ - harness|bigbench:logical_deduction_five_objects
+ - harness|bigbench:logical_deduction_seven_objects
+ - harness|bigbench:logical_deduction_three_objects
+ - harness|bigbench:movie_recommendation
+ - harness|bigbench:navigate
+ - harness|bigbench:reasoning_about_colored_objects
+ - harness|bigbench:ruin_names
+ - harness|bigbench:salient_translation_error_detection
+ - harness|bigbench:snarks
+ - harness|bigbench:sports_understanding
+ - harness|bigbench:temporal_sequences
+ - harness|bigbench:tracking_shuffled_objects_five_objects
+ - harness|bigbench:tracking_shuffled_objects_seven_objects
+ - harness|bigbench:tracking_shuffled_objects_three_objects
+ - harness|wikitext:103:document_level
+
+- helm:
+ - helm|babi_qa
+ - helm|bbq
+ - helm|bbq:Age
+ - helm|bbq:Disability_status
+ - helm|bbq:Gender_identity
+ - helm|bbq:Physical_appearance
+ - helm|bbq:Race_ethnicity
+ - helm|bbq:Race_x_SES
+ - helm|bbq:Race_x_gender
+ - helm|bbq:Religion
+ - helm|bbq:SES
+ - helm|bbq:Sexual_orientation
+ - helm|bbq=Nationality
+ - helm|bigbench:auto_debugging
+ - helm|bigbench:bbq_lite_json:age_ambig
+ - helm|bigbench:bbq_lite_json:age_disambig
+ - helm|bigbench:bbq_lite_json:disability_status_ambig
+ - helm|bigbench:bbq_lite_json:disability_status_disambig
+ - helm|bigbench:bbq_lite_json:gender_identity_ambig
+ - helm|bigbench:bbq_lite_json:gender_identity_disambig
+ - helm|bigbench:bbq_lite_json:nationality_ambig
+ - helm|bigbench:bbq_lite_json:nationality_disambig
+ - helm|bigbench:bbq_lite_json:physical_appearance_ambig
+ - helm|bigbench:bbq_lite_json:physical_appearance_disambig
+ - helm|bigbench:bbq_lite_json:race_ethnicity_ambig
+ - helm|bigbench:bbq_lite_json:race_ethnicity_disambig
+ - helm|bigbench:bbq_lite_json:religion_ambig
+ - helm|bigbench:bbq_lite_json:religion_disambig
+ - helm|bigbench:bbq_lite_json:ses_ambig
+ - helm|bigbench:bbq_lite_json:ses_disambig
+ - helm|bigbench:bbq_lite_json:sexual_orientation_ambig
+ - helm|bigbench:bbq_lite_json:sexual_orientation_disambig
+ - helm|bigbench:code_line_description
+ - helm|bigbench:conceptual_combinations:contradictions
+ - helm|bigbench:conceptual_combinations:emergent_properties
+ - helm|bigbench:conceptual_combinations:fanciful_fictional_combinations
+ - helm|bigbench:conceptual_combinations:homonyms
+ - helm|bigbench:conceptual_combinations:invented_words
+ - helm|bigbench:conlang_translation:adna_from
+ - helm|bigbench:conlang_translation:adna_to
+ - helm|bigbench:conlang_translation:atikampe_from
+ - helm|bigbench:conlang_translation:atikampe_to
+ - helm|bigbench:conlang_translation:gornam_from
+ - helm|bigbench:conlang_translation:gornam_to
+ - helm|bigbench:conlang_translation:holuan_from
+ - helm|bigbench:conlang_translation:holuan_to
+ - helm|bigbench:conlang_translation:mkafala_from
+ - helm|bigbench:conlang_translation:mkafala_to
+ - helm|bigbench:conlang_translation:postpositive_english_from
+ - helm|bigbench:conlang_translation:postpositive_english_to
+ - helm|bigbench:conlang_translation:unapuri_from
+ - helm|bigbench:conlang_translation:unapuri_to
+ - helm|bigbench:conlang_translation:vaomi_from
+ - helm|bigbench:conlang_translation:vaomi_to
+ - helm|bigbench:emoji_movie
+ - helm|bigbench:formal_fallacies_syllogisms_negation
+ - helm|bigbench:hindu_knowledge
+ - helm|bigbench:known_unknowns
+ - helm|bigbench:language_identification
+ - helm|bigbench:linguistics_puzzles
+ - helm|bigbench:logic_grid_puzzle
+ - helm|bigbench:logical_deduction-five_objects
+ - helm|bigbench:logical_deduction-seven_objects
+ - helm|bigbench:logical_deduction-three_objects
+ - helm|bigbench:misconceptions_russian
+ - helm|bigbench:novel_concepts
+ - helm|bigbench:operators
+ - helm|bigbench:parsinlu_reading_comprehension
+ - helm|bigbench:play_dialog_same_or_different
+ - helm|bigbench:repeat_copy_logic
+ - helm|bigbench:strange_stories-boolean
+ - helm|bigbench:strange_stories-multiple_choice
+ - helm|bigbench:strategyqa
+ - helm|bigbench:symbol_interpretation-adversarial
+ - helm|bigbench:symbol_interpretation-emoji_agnostic
+ - helm|bigbench:symbol_interpretation-name_agnostic
+ - helm|bigbench:symbol_interpretation-plain
+ - helm|bigbench:symbol_interpretation-tricky
+ - helm|bigbench:vitaminc_fact_verification
+ - helm|bigbench:winowhy
+ - helm|blimp:adjunct_island
+ - helm|blimp:anaphor_gender_agreement
+ - helm|blimp:anaphor_number_agreement
+ - helm|blimp:animate_subject_passive
+ - helm|blimp:animate_subject_trans
+ - helm|blimp:causative
+ - helm|blimp:complex_NP_island
+ - helm|blimp:coordinate_structure_constraint_complex_left_branch
+ - helm|blimp:coordinate_structure_constraint_object_extraction
+ - helm|blimp:determiner_noun_agreement_1
+ - helm|blimp:determiner_noun_agreement_2
+ - helm|blimp:determiner_noun_agreement_irregular_1
+ - helm|blimp:determiner_noun_agreement_irregular_2
+ - helm|blimp:determiner_noun_agreement_with_adj_2
+ - helm|blimp:determiner_noun_agreement_with_adj_irregular_1
+ - helm|blimp:determiner_noun_agreement_with_adj_irregular_2
+ - helm|blimp:determiner_noun_agreement_with_adjective_1
+ - helm|blimp:distractor_agreement_relational_noun
+ - helm|blimp:distractor_agreement_relative_clause
+ - helm|blimp:drop_argument
+ - helm|blimp:ellipsis_n_bar_1
+ - helm|blimp:ellipsis_n_bar_2
+ - helm|blimp:existential_there_object_raising
+ - helm|blimp:existential_there_quantifiers_1
+ - helm|blimp:existential_there_quantifiers_2
+ - helm|blimp:existential_there_subject_raising
+ - helm|blimp:expletive_it_object_raising
+ - helm|blimp:inchoative
+ - helm|blimp:intransitive
+ - helm|blimp:irregular_past_participle_adjectives
+ - helm|blimp:irregular_past_participle_verbs
+ - helm|blimp:irregular_plural_subject_verb_agreement_1
+ - helm|blimp:irregular_plural_subject_verb_agreement_2
+ - helm|blimp:left_branch_island_echo_question
+ - helm|blimp:left_branch_island_simple_question
+ - helm|blimp:matrix_question_npi_licensor_present
+ - helm|blimp:npi_present_1
+ - helm|blimp:npi_present_2
+ - helm|blimp:only_npi_licensor_present
+ - helm|blimp:only_npi_scope
+ - helm|blimp:passive_1
+ - helm|blimp:passive_2
+ - helm|blimp:principle_A_c_command
+ - helm|blimp:principle_A_case_1
+ - helm|blimp:principle_A_case_2
+ - helm|blimp:principle_A_domain_1
+ - helm|blimp:principle_A_domain_2
+ - helm|blimp:principle_A_domain_3
+ - helm|blimp:principle_A_reconstruction
+ - helm|blimp:regular_plural_subject_verb_agreement_1
+ - helm|blimp:regular_plural_subject_verb_agreement_2
+ - helm|blimp:sentential_negation_npi_licensor_present
+ - helm|blimp:sentential_negation_npi_scope
+ - helm|blimp:sentential_subject_island
+ - helm|blimp:superlative_quantifiers_1
+ - helm|blimp:superlative_quantifiers_2
+ - helm|blimp:tough_vs_raising_1
+ - helm|blimp:tough_vs_raising_2
+ - helm|blimp:transitive
+ - helm|blimp:wh_island
+ - helm|blimp:wh_questions_object_gap
+ - helm|blimp:wh_questions_subject_gap
+ - helm|blimp:wh_questions_subject_gap_long_distance
+ - helm|blimp:wh_vs_that_no_gap
+ - helm|blimp:wh_vs_that_no_gap_long_distance
+ - helm|blimp:wh_vs_that_with_gap
+ - helm|blimp:wh_vs_that_with_gap_long_distance
+ - helm|bold
+ - helm|bold:gender
+ - helm|bold:political_ideology
+ - helm|bold:profession
+ - helm|bold:race
+ - helm|bold:religious_ideology
+ - helm|boolq
+ - helm|boolq:contrastset
+ - helm|civil_comments
+ - helm|civil_comments:LGBTQ
+ - helm|civil_comments:black
+ - helm|civil_comments:christian
+ - helm|civil_comments:female
+ - helm|civil_comments:male
+ - helm|civil_comments:muslim
+ - helm|civil_comments:other_religions
+ - helm|civil_comments:white
+ - helm|commonsenseqa
+ - helm|copyright:n_books_1000-extractions_per_book_1-prefix_length_125
+ - helm|copyright:n_books_1000-extractions_per_book_1-prefix_length_25
+ - helm|copyright:n_books_1000-extractions_per_book_1-prefix_length_5
+ - helm|copyright:n_books_1000-extractions_per_book_3-prefix_length_125
+ - helm|copyright:n_books_1000-extractions_per_book_3-prefix_length_25
+ - helm|copyright:n_books_1000-extractions_per_book_3-prefix_length_5
+ - helm|copyright:oh_the_places
+ - helm|copyright:pilot
+ - helm|copyright:popular_books-prefix_length_10
+ - helm|copyright:popular_books-prefix_length_125
+ - helm|copyright:popular_books-prefix_length_25
+ - helm|copyright:popular_books-prefix_length_250
+ - helm|copyright:popular_books-prefix_length_5
+ - helm|copyright:popular_books-prefix_length_50
+ - helm|copyright:prompt_num_line_1-min_lines_20
+ - helm|copyright:prompt_num_line_10-min_lines_20
+ - helm|copyright:prompt_num_line_5-min_lines_20
+ - helm|covid_dialogue
+ - helm|dyck_language:2
+ - helm|dyck_language:3
+ - helm|dyck_language:4
+ - helm|entity_data_imputation:Buy
+ - helm|entity_data_imputation:Restaurant
+ - helm|entity_matching:Abt_Buy
+ - helm|entity_matching:Amazon_Google
+ - helm|entity_matching:Beer
+ - helm|entity_matching:Company
+ - helm|entity_matching:DBLP_ACM
+ - helm|entity_matching:DBLP_GoogleScholar
+ - helm|entity_matching:Dirty_DBLP_ACM
+ - helm|entity_matching:Dirty_DBLP_GoogleScholar
+ - helm|entity_matching:Dirty_Walmart_Amazon
+ - helm|entity_matching:Dirty_iTunes_Amazon
+ - helm|entity_matching:Walmart_Amazon
+ - helm|entity_matching:iTunes_Amazon
+ - helm|entity_matching=Fodors_Zagats
+ - helm|hellaswag
+ - helm|imdb
+ - helm|imdb:contrastset
+ - helm|interactive_qa_mmlu:abstract_algebra
+ - helm|interactive_qa_mmlu:college_chemistry
+ - helm|interactive_qa_mmlu:global_facts
+ - helm|interactive_qa_mmlu:miscellaneous
+ - helm|interactive_qa_mmlu:nutrition
+ - helm|interactive_qa_mmlu:us_foreign_policy
+ - helm|legal_summarization:billsum
+ - helm|legal_summarization:eurlexsum
+ - helm|legal_summarization:multilexsum
+ - helm|legalsupport
+ - helm|lexglue:case_hold
+ - helm|lexglue:ecthr_a
+ - helm|lexglue:ecthr_b
+ - helm|lexglue:eurlex
+ - helm|lexglue:ledgar
+ - helm|lexglue:scotus
+ - helm|lexglue:unfair_tos
+ - helm|lextreme:brazilian_court_decisions_judgment
+ - helm|lextreme:brazilian_court_decisions_unanimity
+ - helm|lextreme:covid19_emergency_event
+ - helm|lextreme:german_argument_mining
+ - helm|lextreme:greek_legal_code_chapter
+ - helm|lextreme:greek_legal_code_subject
+ - helm|lextreme:greek_legal_code_volume
+ - helm|lextreme:greek_legal_ner
+ - helm|lextreme:legalnero
+ - helm|lextreme:lener_br
+ - helm|lextreme:mapa_coarse
+ - helm|lextreme:mapa_fine
+ - helm|lextreme:multi_eurlex_level_1
+ - helm|lextreme:multi_eurlex_level_2
+ - helm|lextreme:multi_eurlex_level_3
+ - helm|lextreme:online_terms_of_service_clause_topics
+ - helm|lextreme:online_terms_of_service_unfairness_levels
+ - helm|lextreme:swiss_judgment_prediction
+ - helm|lsat_qa
+ - helm|lsat_qa:assignment
+ - helm|lsat_qa:grouping
+ - helm|lsat_qa:miscellaneous
+ - helm|lsat_qa:ordering
+ - helm|me_q_sum
+ - helm|med_dialog:healthcaremagic
+ - helm|med_dialog:icliniq
+ - helm|med_mcqa
+ - helm|med_paragraph_simplification
+ - helm|med_qa
+ - helm|mmlu
+ - helm|mmlu:abstract_algebra
+ - helm|mmlu:anatomy
+ - helm|mmlu:astronomy
+ - helm|mmlu:business_ethics
+ - helm|mmlu:clinical_knowledge
+ - helm|mmlu:college_biology
+ - helm|mmlu:college_chemistry
+ - helm|mmlu:college_computer_science
+ - helm|mmlu:college_mathematics
+ - helm|mmlu:college_medicine
+ - helm|mmlu:college_physics
+ - helm|mmlu:computer_security
+ - helm|mmlu:conceptual_physics
+ - helm|mmlu:econometrics
+ - helm|mmlu:electrical_engineering
+ - helm|mmlu:elementary_mathematics
+ - helm|mmlu:formal_logic
+ - helm|mmlu:global_facts
+ - helm|mmlu:high_school_biology
+ - helm|mmlu:high_school_chemistry
+ - helm|mmlu:high_school_computer_science
+ - helm|mmlu:high_school_european_history
+ - helm|mmlu:high_school_geography
+ - helm|mmlu:high_school_government_and_politics
+ - helm|mmlu:high_school_macroeconomics
+ - helm|mmlu:high_school_mathematics
+ - helm|mmlu:high_school_microeconomics
+ - helm|mmlu:high_school_physics
+ - helm|mmlu:high_school_psychology
+ - helm|mmlu:high_school_statistics
+ - helm|mmlu:high_school_us_history
+ - helm|mmlu:high_school_world_history
+ - helm|mmlu:human_aging
+ - helm|mmlu:human_sexuality
+ - helm|mmlu:international_law
+ - helm|mmlu:jurisprudence
+ - helm|mmlu:logical_fallacies
+ - helm|mmlu:machine_learning
+ - helm|mmlu:management
+ - helm|mmlu:marketing
+ - helm|mmlu:medical_genetics
+ - helm|mmlu:miscellaneous
+ - helm|mmlu:moral_disputes
+ - helm|mmlu:moral_scenarios
+ - helm|mmlu:nutrition
+ - helm|mmlu:philosophy
+ - helm|mmlu:prehistory
+ - helm|mmlu:professional_accounting
+ - helm|mmlu:professional_law
+ - helm|mmlu:professional_medicine
+ - helm|mmlu:professional_psychology
+ - helm|mmlu:public_relations
+ - helm|mmlu:security_studies
+ - helm|mmlu:sociology
+ - helm|mmlu:us_foreign_policy
+ - helm|mmlu:virology
+ - helm|mmlu:world_religions
+ - helm|narrativeqa
+ - helm|numeracy:linear_example
+ - helm|numeracy:linear_standard
+ - helm|numeracy:parabola_example
+ - helm|numeracy:parabola_standard
+ - helm|numeracy:paraboloid_example
+ - helm|numeracy:paraboloid_standard
+ - helm|numeracy:plane_example
+ - helm|numeracy:plane_standard
+ - helm|openbookqa
+ - helm|piqa
+ - helm|pubmedqa
+ - helm|quac
+ - helm|raft:ade_corpus_v2
+ - helm|raft:banking_77
+ - helm|raft:neurips_impact_statement_risks
+ - helm|raft:one_stop_english
+ - helm|raft:overruling
+ - helm|raft:semiconductor_org_types
+ - helm|raft:systematic_review_inclusion
+ - helm|raft:tai_safety_research
+ - helm|raft:terms_of_service
+ - helm|raft:tweet_eval_hate
+ - helm|raft:twitter_complaints
+ - helm|real_toxicity_prompts
+ - helm|siqa
+ - helm|summarization:cnn-dm
+ - helm|summarization:xsum
+ - helm|summarization:xsum-sampled
+ - helm|synthetic_reasoning:induction
+ - helm|synthetic_reasoning:natural_easy
+ - helm|synthetic_reasoning:natural_hard
+ - helm|synthetic_reasoning:pattern_match
+ - helm|synthetic_reasoning:variable_substitution
+ - helm|the_pile:arxiv
+ - helm|the_pile:bibliotik
+ - helm|the_pile:commoncrawl
+ - helm|the_pile:dm-mathematics
+ - helm|the_pile:enron
+ - helm|the_pile:europarl
+ - helm|the_pile:freelaw
+ - helm|the_pile:github
+ - helm|the_pile:gutenberg
+ - helm|the_pile:hackernews
+ - helm|the_pile:nih-exporter
+ - helm|the_pile:opensubtitles
+ - helm|the_pile:openwebtext2
+ - helm|the_pile:pubmed-abstracts
+ - helm|the_pile:pubmed-central
+ - helm|the_pile:stackexchange
+ - helm|the_pile:upsto
+ - helm|the_pile:wikipedia
+ - helm|the_pile:youtubesubtitles
+ - helm|truthfulqa
+ - helm|twitterAAE:aa
+ - helm|twitterAAE:white
+ - helm|wikifact:applies_to_jurisdiction
+ - helm|wikifact:atomic_number
+ - helm|wikifact:author
+ - helm|wikifact:award_received
+ - helm|wikifact:basic_form_of_government
+ - helm|wikifact:capital
+ - helm|wikifact:capital_of
+ - helm|wikifact:central_bank
+ - helm|wikifact:composer
+ - helm|wikifact:continent
+ - helm|wikifact:country
+ - helm|wikifact:country_of_citizenship
+ - helm|wikifact:country_of_origin
+ - helm|wikifact:creator
+ - helm|wikifact:currency
+ - helm|wikifact:defendant
+ - helm|wikifact:developer
+ - helm|wikifact:diplomatic_relation
+ - helm|wikifact:director
+ - helm|wikifact:discoverer_or_inventor
+ - helm|wikifact:drug_or_therapy_used_for_treatment
+ - helm|wikifact:educated_at
+ - helm|wikifact:electron_configuration
+ - helm|wikifact:employer
+ - helm|wikifact:field_of_work
+ - helm|wikifact:file_extension
+ - helm|wikifact:genetic_association
+ - helm|wikifact:genre
+ - helm|wikifact:has_part
+ - helm|wikifact:head_of_government
+ - helm|wikifact:head_of_state
+ - helm|wikifact:headquarters_location
+ - helm|wikifact:industry
+ - helm|wikifact:influenced_by
+ - helm|wikifact:instance_of
+ - helm|wikifact:instrument
+ - helm|wikifact:language_of_work_or_name
+ - helm|wikifact:languages_spoken_written_or_signed
+ - helm|wikifact:laws_applied
+ - helm|wikifact:located_in_the_administrative_territorial_entity
+ - helm|wikifact:location
+ - helm|wikifact:location_of_discovery
+ - helm|wikifact:location_of_formation
+ - helm|wikifact:majority_opinion_by
+ - helm|wikifact:manufacturer
+ - helm|wikifact:measured_physical_quantity
+ - helm|wikifact:medical_condition_treated
+ - helm|wikifact:member_of
+ - helm|wikifact:member_of_political_party
+ - helm|wikifact:member_of_sports_team
+ - helm|wikifact:movement
+ - helm|wikifact:named_after
+ - helm|wikifact:native_language
+ - helm|wikifact:number_of_processor_cores
+ - helm|wikifact:occupation
+ - helm|wikifact:office_held_by_head_of_government
+ - helm|wikifact:office_held_by_head_of_state
+ - helm|wikifact:official_language
+ - helm|wikifact:operating_system
+ - helm|wikifact:original_language_of_film_or_TV_show
+ - helm|wikifact:original_network
+ - helm|wikifact:overrules
+ - helm|wikifact:owned_by
+ - helm|wikifact:part_of
+ - helm|wikifact:participating_team
+ - helm|wikifact:place_of_birth
+ - helm|wikifact:place_of_death
+ - helm|wikifact:plaintiff
+ - helm|wikifact:position_held
+ - helm|wikifact:position_played_on_team
+ - helm|wikifact:programming_language
+ - helm|wikifact:recommended_unit_of_measurement
+ - helm|wikifact:record_label
+ - helm|wikifact:religion
+ - helm|wikifact:repealed_by
+ - helm|wikifact:shares_border_with
+ - helm|wikifact:solved_by
+ - helm|wikifact:statement_describes
+ - helm|wikifact:stock_exchange
+ - helm|wikifact:subclass_of
+ - helm|wikifact:subsidiary
+ - helm|wikifact:symptoms_and_signs
+ - helm|wikifact:therapeutic_area
+ - helm|wikifact:time_of_discovery_or_invention
+ - helm|wikifact:twinned_administrative_body
+ - helm|wikifact:work_location
+ - helm|wikitext:103:document_level
+ - helm|wmt14:cs-en
+ - helm|wmt14:de-en
+ - helm|wmt14:fr-en
+ - helm|wmt14:hi-en
+ - helm|wmt14:ru-en
+
+- leaderboard:
+ - leaderboard|arc:challenge
+ - leaderboard|gsm8k
+ - leaderboard|hellaswag
+ - leaderboard|mmlu:abstract_algebra
+ - leaderboard|mmlu:anatomy
+ - leaderboard|mmlu:astronomy
+ - leaderboard|mmlu:business_ethics
+ - leaderboard|mmlu:clinical_knowledge
+ - leaderboard|mmlu:college_biology
+ - leaderboard|mmlu:college_chemistry
+ - leaderboard|mmlu:college_computer_science
+ - leaderboard|mmlu:college_mathematics
+ - leaderboard|mmlu:college_medicine
+ - leaderboard|mmlu:college_physics
+ - leaderboard|mmlu:computer_security
+ - leaderboard|mmlu:conceptual_physics
+ - leaderboard|mmlu:econometrics
+ - leaderboard|mmlu:electrical_engineering
+ - leaderboard|mmlu:elementary_mathematics
+ - leaderboard|mmlu:formal_logic
+ - leaderboard|mmlu:global_facts
+ - leaderboard|mmlu:high_school_biology
+ - leaderboard|mmlu:high_school_chemistry
+ - leaderboard|mmlu:high_school_computer_science
+ - leaderboard|mmlu:high_school_european_history
+ - leaderboard|mmlu:high_school_geography
+ - leaderboard|mmlu:high_school_government_and_politics
+ - leaderboard|mmlu:high_school_macroeconomics
+ - leaderboard|mmlu:high_school_mathematics
+ - leaderboard|mmlu:high_school_microeconomics
+ - leaderboard|mmlu:high_school_physics
+ - leaderboard|mmlu:high_school_psychology
+ - leaderboard|mmlu:high_school_statistics
+ - leaderboard|mmlu:high_school_us_history
+ - leaderboard|mmlu:high_school_world_history
+ - leaderboard|mmlu:human_aging
+ - leaderboard|mmlu:human_sexuality
+ - leaderboard|mmlu:international_law
+ - leaderboard|mmlu:jurisprudence
+ - leaderboard|mmlu:logical_fallacies
+ - leaderboard|mmlu:machine_learning
+ - leaderboard|mmlu:management
+ - leaderboard|mmlu:marketing
+ - leaderboard|mmlu:medical_genetics
+ - leaderboard|mmlu:miscellaneous
+ - leaderboard|mmlu:moral_disputes
+ - leaderboard|mmlu:moral_scenarios
+ - leaderboard|mmlu:nutrition
+ - leaderboard|mmlu:philosophy
+ - leaderboard|mmlu:prehistory
+ - leaderboard|mmlu:professional_accounting
+ - leaderboard|mmlu:professional_law
+ - leaderboard|mmlu:professional_medicine
+ - leaderboard|mmlu:professional_psychology
+ - leaderboard|mmlu:public_relations
+ - leaderboard|mmlu:security_studies
+ - leaderboard|mmlu:sociology
+ - leaderboard|mmlu:us_foreign_policy
+ - leaderboard|mmlu:virology
+ - leaderboard|mmlu:world_religions
+ - leaderboard|truthfulqa:mc
+ - leaderboard|winogrande
+
+- lighteval:
+ - lighteval|agieval:aqua-rat
+ - lighteval|agieval:gaokao-biology
+ - lighteval|agieval:gaokao-chemistry
+ - lighteval|agieval:gaokao-chinese
+ - lighteval|agieval:gaokao-english
+ - lighteval|agieval:gaokao-geography
+ - lighteval|agieval:gaokao-history
+ - lighteval|agieval:gaokao-mathqa
+ - lighteval|agieval:gaokao-physics
+ - lighteval|agieval:logiqa-en
+ - lighteval|agieval:logiqa-zh
+ - lighteval|agieval:lsat-ar
+ - lighteval|agieval:lsat-lr
+ - lighteval|agieval:lsat-rc
+ - lighteval|agieval:sat-en
+ - lighteval|agieval:sat-en-without-passage
+ - lighteval|agieval:sat-math
+ - lighteval|anli
+ - lighteval|anli:r1
+ - lighteval|anli:r2
+ - lighteval|anli:r3
+ - lighteval|arc:easy
+ - lighteval|arithmetic:1dc
+ - lighteval|arithmetic:2da
+ - lighteval|arithmetic:2dm
+ - lighteval|arithmetic:2ds
+ - lighteval|arithmetic:3da
+ - lighteval|arithmetic:3ds
+ - lighteval|arithmetic:4da
+ - lighteval|arithmetic:4ds
+ - lighteval|arithmetic:5da
+ - lighteval|arithmetic:5ds
+ - lighteval|asdiv
+ - lighteval|bigbench:causal_judgment
+ - lighteval|bigbench:date_understanding
+ - lighteval|bigbench:disambiguation_qa
+ - lighteval|bigbench:geometric_shapes
+ - lighteval|bigbench:logical_deduction_five_objects
+ - lighteval|bigbench:logical_deduction_seven_objects
+ - lighteval|bigbench:logical_deduction_three_objects
+ - lighteval|bigbench:movie_recommendation
+ - lighteval|bigbench:navigate
+ - lighteval|bigbench:reasoning_about_colored_objects
+ - lighteval|bigbench:ruin_names
+ - lighteval|bigbench:salient_translation_error_detection
+ - lighteval|bigbench:snarks
+ - lighteval|bigbench:sports_understanding
+ - lighteval|bigbench:temporal_sequences
+ - lighteval|bigbench:tracking_shuffled_objects_five_objects
+ - lighteval|bigbench:tracking_shuffled_objects_seven_objects
+ - lighteval|bigbench:tracking_shuffled_objects_three_objects
+ - lighteval|blimp:adjunct_island
+ - lighteval|blimp:anaphor_gender_agreement
+ - lighteval|blimp:anaphor_number_agreement
+ - lighteval|blimp:animate_subject_passive
+ - lighteval|blimp:animate_subject_trans
+ - lighteval|blimp:causative
+ - lighteval|blimp:complex_NP_island
+ - lighteval|blimp:coordinate_structure_constraint_complex_left_branch
+ - lighteval|blimp:coordinate_structure_constraint_object_extraction
+ - lighteval|blimp:determiner_noun_agreement_1
+ - lighteval|blimp:determiner_noun_agreement_2
+ - lighteval|blimp:determiner_noun_agreement_irregular_1
+ - lighteval|blimp:determiner_noun_agreement_irregular_2
+ - lighteval|blimp:determiner_noun_agreement_with_adj_2
+ - lighteval|blimp:determiner_noun_agreement_with_adj_irregular_1
+ - lighteval|blimp:determiner_noun_agreement_with_adj_irregular_2
+ - lighteval|blimp:determiner_noun_agreement_with_adjective_1
+ - lighteval|blimp:distractor_agreement_relational_noun
+ - lighteval|blimp:distractor_agreement_relative_clause
+ - lighteval|blimp:drop_argument
+ - lighteval|blimp:ellipsis_n_bar_1
+ - lighteval|blimp:ellipsis_n_bar_2
+ - lighteval|blimp:existential_there_object_raising
+ - lighteval|blimp:existential_there_quantifiers_1
+ - lighteval|blimp:existential_there_quantifiers_2
+ - lighteval|blimp:existential_there_subject_raising
+ - lighteval|blimp:expletive_it_object_raising
+ - lighteval|blimp:inchoative
+ - lighteval|blimp:intransitive
+ - lighteval|blimp:irregular_past_participle_adjectives
+ - lighteval|blimp:irregular_past_participle_verbs
+ - lighteval|blimp:irregular_plural_subject_verb_agreement_1
+ - lighteval|blimp:irregular_plural_subject_verb_agreement_2
+ - lighteval|blimp:left_branch_island_echo_question
+ - lighteval|blimp:left_branch_island_simple_question
+ - lighteval|blimp:matrix_question_npi_licensor_present
+ - lighteval|blimp:npi_present_1
+ - lighteval|blimp:npi_present_2
+ - lighteval|blimp:only_npi_licensor_present
+ - lighteval|blimp:only_npi_scope
+ - lighteval|blimp:passive_1
+ - lighteval|blimp:passive_2
+ - lighteval|blimp:principle_A_c_command
+ - lighteval|blimp:principle_A_case_1
+ - lighteval|blimp:principle_A_case_2
+ - lighteval|blimp:principle_A_domain_1
+ - lighteval|blimp:principle_A_domain_2
+ - lighteval|blimp:principle_A_domain_3
+ - lighteval|blimp:principle_A_reconstruction
+ - lighteval|blimp:regular_plural_subject_verb_agreement_1
+ - lighteval|blimp:regular_plural_subject_verb_agreement_2
+ - lighteval|blimp:sentential_negation_npi_licensor_present
+ - lighteval|blimp:sentential_negation_npi_scope
+ - lighteval|blimp:sentential_subject_island
+ - lighteval|blimp:superlative_quantifiers_1
+ - lighteval|blimp:superlative_quantifiers_2
+ - lighteval|blimp:tough_vs_raising_1
+ - lighteval|blimp:tough_vs_raising_2
+ - lighteval|blimp:transitive
+ - lighteval|blimp:wh_island
+ - lighteval|blimp:wh_questions_object_gap
+ - lighteval|blimp:wh_questions_subject_gap
+ - lighteval|blimp:wh_questions_subject_gap_long_distance
+ - lighteval|blimp:wh_vs_that_no_gap
+ - lighteval|blimp:wh_vs_that_no_gap_long_distance
+ - lighteval|blimp:wh_vs_that_with_gap
+ - lighteval|blimp:wh_vs_that_with_gap_long_distance
+ - lighteval|coqa
+ - lighteval|coqa_bb
+ - lighteval|drop
+ - lighteval|ethics:commonsense
+ - lighteval|ethics:deontology
+ - lighteval|ethics:justice
+ - lighteval|ethics:utilitarianism
+ - lighteval|ethics:virtue
+ - lighteval|glue:cola
+ - lighteval|glue:mnli
+ - lighteval|glue:mnli_mismatched
+ - lighteval|glue:mrpc
+ - lighteval|glue:qnli
+ - lighteval|glue:qqp
+ - lighteval|glue:rte
+ - lighteval|glue:sst2
+ - lighteval|glue:stsb
+ - lighteval|glue:wnli
+ - lighteval|gpqa
+ - lighteval|gsm8k
+ - lighteval|headqa:en
+ - lighteval|headqa:es
+ - lighteval|iwslt17:ar-en
+ - lighteval|iwslt17:de-en
+ - lighteval|iwslt17:en-ar
+ - lighteval|iwslt17:en-de
+ - lighteval|iwslt17:en-fr
+ - lighteval|iwslt17:en-ja
+ - lighteval|iwslt17:en-ko
+ - lighteval|iwslt17:en-zh
+ - lighteval|iwslt17:fr-en
+ - lighteval|iwslt17:ja-en
+ - lighteval|iwslt17:ko-en
+ - lighteval|iwslt17:zh-en
+ - lighteval|lambada:openai
+ - lighteval|lambada:openai:de
+ - lighteval|lambada:openai:en
+ - lighteval|lambada:openai:es
+ - lighteval|lambada:openai:fr
+ - lighteval|lambada:openai:it
+ - lighteval|lambada:openai_cloze
+ - lighteval|lambada:standard
+ - lighteval|lambada:standard_cloze
+ - lighteval|logiqa
+ - lighteval|math:algebra
+ - lighteval|math:counting_and_probability
+ - lighteval|math:geometry
+ - lighteval|math:intermediate_algebra
+ - lighteval|math:number_theory
+ - lighteval|math:prealgebra
+ - lighteval|math:precalculus
+ - lighteval|math_cot:algebra
+ - lighteval|math_cot:counting_and_probability
+ - lighteval|math_cot:geometry
+ - lighteval|math_cot:intermediate_algebra
+ - lighteval|math_cot:number_theory
+ - lighteval|math_cot:prealgebra
+ - lighteval|math_cot:precalculus
+ - lighteval|mathqa
+ - lighteval|mgsm:bn
+ - lighteval|mgsm:de
+ - lighteval|mgsm:en
+ - lighteval|mgsm:es
+ - lighteval|mgsm:fr
+ - lighteval|mgsm:ja
+ - lighteval|mgsm:ru
+ - lighteval|mgsm:sw
+ - lighteval|mgsm:te
+ - lighteval|mgsm:th
+ - lighteval|mgsm:zh
+ - lighteval|mtnt2019:en-fr
+ - lighteval|mtnt2019:en-ja
+ - lighteval|mtnt2019:fr-en
+ - lighteval|mtnt2019:ja-en
+ - lighteval|mutual
+ - lighteval|mutual_plus
+ - lighteval|openbookqa
+ - lighteval|piqa
+ - lighteval|prost
+ - lighteval|pubmedqa
+ - lighteval|qa4mre:2011
+ - lighteval|qa4mre:2012
+ - lighteval|qa4mre:2013
+ - lighteval|qasper
+ - lighteval|qasper_ll
+ - lighteval|race:high
+ - lighteval|sciq
+ - lighteval|storycloze:2016
+ - lighteval|storycloze:2018
+ - lighteval|super_glue:boolq
+ - lighteval|super_glue:cb
+ - lighteval|super_glue:copa
+ - lighteval|super_glue:multirc
+ - lighteval|super_glue:rte
+ - lighteval|super_glue:wic
+ - lighteval|super_glue:wsc
+ - lighteval|swag
+ - lighteval|the_pile:arxiv
+ - lighteval|the_pile:bookcorpus2
+ - lighteval|the_pile:books3
+ - lighteval|the_pile:dm-mathematics
+ - lighteval|the_pile:enron
+ - lighteval|the_pile:europarl
+ - lighteval|the_pile:freelaw
+ - lighteval|the_pile:github
+ - lighteval|the_pile:gutenberg
+ - lighteval|the_pile:hackernews
+ - lighteval|the_pile:nih-exporter
+ - lighteval|the_pile:opensubtitles
+ - lighteval|the_pile:openwebtext2
+ - lighteval|the_pile:philpapers
+ - lighteval|the_pile:pile-cc
+ - lighteval|the_pile:pubmed-abstracts
+ - lighteval|the_pile:pubmed-central
+ - lighteval|the_pile:stackexchange
+ - lighteval|the_pile:ubuntu-irc
+ - lighteval|the_pile:uspto
+ - lighteval|the_pile:wikipedia
+ - lighteval|the_pile:youtubesubtitles
+ - lighteval|toxigen
+ - lighteval|triviaqa
+ - lighteval|truthfulqa:gen
+ - lighteval|unscramble:anagrams1
+ - lighteval|unscramble:anagrams2
+ - lighteval|unscramble:cycle_letters
+ - lighteval|unscramble:random_insertion
+ - lighteval|unscramble:reversed_words
+ - lighteval|webqs
+ - lighteval|wikitext:2
+ - lighteval|wmt08:cs-en
+ - lighteval|wmt08:de-en
+ - lighteval|wmt08:en-cs
+ - lighteval|wmt08:en-de
+ - lighteval|wmt08:en-es
+ - lighteval|wmt08:en-fr
+ - lighteval|wmt08:en-hu
+ - lighteval|wmt08:es-en
+ - lighteval|wmt08:fr-en
+ - lighteval|wmt08:hu-en
+ - lighteval|wmt09:cs-en
+ - lighteval|wmt09:de-en
+ - lighteval|wmt09:en-cs
+ - lighteval|wmt09:en-de
+ - lighteval|wmt09:en-es
+ - lighteval|wmt09:en-fr
+ - lighteval|wmt09:en-hu
+ - lighteval|wmt09:en-it
+ - lighteval|wmt09:es-en
+ - lighteval|wmt09:fr-en
+ - lighteval|wmt09:hu-en
+ - lighteval|wmt09:it-en
+ - lighteval|wmt10:cs-en
+ - lighteval|wmt10:de-en
+ - lighteval|wmt10:en-cs
+ - lighteval|wmt10:en-de
+ - lighteval|wmt10:en-es
+ - lighteval|wmt10:en-fr
+ - lighteval|wmt10:es-en
+ - lighteval|wmt10:fr-en
+ - lighteval|wmt11:cs-en
+ - lighteval|wmt11:de-en
+ - lighteval|wmt11:en-cs
+ - lighteval|wmt11:en-de
+ - lighteval|wmt11:en-es
+ - lighteval|wmt11:en-fr
+ - lighteval|wmt11:es-en
+ - lighteval|wmt11:fr-en
+ - lighteval|wmt12:cs-en
+ - lighteval|wmt12:de-en
+ - lighteval|wmt12:en-cs
+ - lighteval|wmt12:en-de
+ - lighteval|wmt12:en-es
+ - lighteval|wmt12:en-fr
+ - lighteval|wmt12:es-en
+ - lighteval|wmt12:fr-en
+ - lighteval|wmt13:cs-en
+ - lighteval|wmt13:de-en
+ - lighteval|wmt13:en-cs
+ - lighteval|wmt13:en-de
+ - lighteval|wmt13:en-es
+ - lighteval|wmt13:en-fr
+ - lighteval|wmt13:en-ru
+ - lighteval|wmt13:es-en
+ - lighteval|wmt13:fr-en
+ - lighteval|wmt13:ru-en
+ - lighteval|wmt14:cs-en
+ - lighteval|wmt14:de-en
+ - lighteval|wmt14:en-cs
+ - lighteval|wmt14:en-de
+ - lighteval|wmt14:en-fr
+ - lighteval|wmt14:en-hi
+ - lighteval|wmt14:en-ru
+ - lighteval|wmt14:fr-en
+ - lighteval|wmt14:hi-en
+ - lighteval|wmt14:ru-en
+ - lighteval|wmt15:cs-en
+ - lighteval|wmt15:de-en
+ - lighteval|wmt15:en-cs
+ - lighteval|wmt15:en-de
+ - lighteval|wmt15:en-fi
+ - lighteval|wmt15:en-fr
+ - lighteval|wmt15:en-ru
+ - lighteval|wmt15:fi-en
+ - lighteval|wmt15:fr-en
+ - lighteval|wmt15:ru-en
+ - lighteval|wmt16:cs-en
+ - lighteval|wmt16:de-en
+ - lighteval|wmt16:en-cs
+ - lighteval|wmt16:en-de
+ - lighteval|wmt16:en-fi
+ - lighteval|wmt16:en-ro
+ - lighteval|wmt16:en-ru
+ - lighteval|wmt16:en-tr
+ - lighteval|wmt16:fi-en
+ - lighteval|wmt16:ro-en
+ - lighteval|wmt16:ru-en
+ - lighteval|wmt16:tr-en
+ - lighteval|wmt17:cs-en
+ - lighteval|wmt17:de-en
+ - lighteval|wmt17:en-cs
+ - lighteval|wmt17:en-de
+ - lighteval|wmt17:en-fi
+ - lighteval|wmt17:en-lv
+ - lighteval|wmt17:en-ru
+ - lighteval|wmt17:en-tr
+ - lighteval|wmt17:en-zh
+ - lighteval|wmt17:fi-en
+ - lighteval|wmt17:lv-en
+ - lighteval|wmt17:ru-en
+ - lighteval|wmt17:tr-en
+ - lighteval|wmt17:zh-en
+ - lighteval|wmt18:cs-en
+ - lighteval|wmt18:de-en
+ - lighteval|wmt18:en-cs
+ - lighteval|wmt18:en-de
+ - lighteval|wmt18:en-et
+ - lighteval|wmt18:en-fi
+ - lighteval|wmt18:en-ru
+ - lighteval|wmt18:en-tr
+ - lighteval|wmt18:en-zh
+ - lighteval|wmt18:et-en
+ - lighteval|wmt18:fi-en
+ - lighteval|wmt18:ru-en
+ - lighteval|wmt18:tr-en
+ - lighteval|wmt18:zh-en
+ - lighteval|wmt19:cs-de
+ - lighteval|wmt19:de-cs
+ - lighteval|wmt19:de-en
+ - lighteval|wmt19:de-fr
+ - lighteval|wmt19:en-cs
+ - lighteval|wmt19:en-de
+ - lighteval|wmt19:en-fi
+ - lighteval|wmt19:en-gu
+ - lighteval|wmt19:en-kk
+ - lighteval|wmt19:en-lt
+ - lighteval|wmt19:en-ru
+ - lighteval|wmt19:en-zh
+ - lighteval|wmt19:fi-en
+ - lighteval|wmt19:fr-de
+ - lighteval|wmt19:gu-en
+ - lighteval|wmt19:kk-en
+ - lighteval|wmt19:lt-en
+ - lighteval|wmt19:ru-en
+ - lighteval|wmt19:zh-en
+ - lighteval|wmt20:cs-en
+ - lighteval|wmt20:de-en
+ - lighteval|wmt20:de-fr
+ - lighteval|wmt20:en-cs
+ - lighteval|wmt20:en-de
+ - lighteval|wmt20:en-iu
+ - lighteval|wmt20:en-ja
+ - lighteval|wmt20:en-km
+ - lighteval|wmt20:en-pl
+ - lighteval|wmt20:en-ps
+ - lighteval|wmt20:en-ru
+ - lighteval|wmt20:en-ta
+ - lighteval|wmt20:en-zh
+ - lighteval|wmt20:fr-de
+ - lighteval|wmt20:iu-en
+ - lighteval|wmt20:ja-en
+ - lighteval|wmt20:km-en
+ - lighteval|wmt20:pl-en
+ - lighteval|wmt20:ps-en
+ - lighteval|wmt20:ru-en
+ - lighteval|wmt20:ta-en
+ - lighteval|wmt20:zh-en
+ - lighteval|wsc273
+ - lighteval|xcopa:en
+ - lighteval|xcopa:et
+ - lighteval|xcopa:ht
+ - lighteval|xcopa:id
+ - lighteval|xcopa:it
+ - lighteval|xcopa:qu
+ - lighteval|xcopa:sw
+ - lighteval|xcopa:ta
+ - lighteval|xcopa:th
+ - lighteval|xcopa:tr
+ - lighteval|xcopa:vi
+ - lighteval|xcopa:zh
+ - lighteval|xstory_cloze:ar
+ - lighteval|xstory_cloze:en
+ - lighteval|xstory_cloze:es
+ - lighteval|xstory_cloze:eu
+ - lighteval|xstory_cloze:hi
+ - lighteval|xstory_cloze:id
+ - lighteval|xstory_cloze:my
+ - lighteval|xstory_cloze:ru
+ - lighteval|xstory_cloze:sw
+ - lighteval|xstory_cloze:te
+ - lighteval|xstory_cloze:zh
+ - lighteval|xwinograd:en
+ - lighteval|xwinograd:fr
+ - lighteval|xwinograd:jp
+ - lighteval|xwinograd:pt
+ - lighteval|xwinograd:ru
+ - lighteval|xwinograd:zh
+
+- original:
+ - original|arc:c:letters
+ - original|arc:c:options
+ - original|arc:c:simple
+ - original|mmlu
+ - original|mmlu:abstract_algebra
+ - original|mmlu:anatomy
+ - original|mmlu:astronomy
+ - original|mmlu:business_ethics
+ - original|mmlu:clinical_knowledge
+ - original|mmlu:college_biology
+ - original|mmlu:college_chemistry
+ - original|mmlu:college_computer_science
+ - original|mmlu:college_mathematics
+ - original|mmlu:college_medicine
+ - original|mmlu:college_physics
+ - original|mmlu:computer_security
+ - original|mmlu:conceptual_physics
+ - original|mmlu:econometrics
+ - original|mmlu:electrical_engineering
+ - original|mmlu:elementary_mathematics
+ - original|mmlu:formal_logic
+ - original|mmlu:global_facts
+ - original|mmlu:high_school_biology
+ - original|mmlu:high_school_chemistry
+ - original|mmlu:high_school_computer_science
+ - original|mmlu:high_school_european_history
+ - original|mmlu:high_school_geography
+ - original|mmlu:high_school_government_and_politics
+ - original|mmlu:high_school_macroeconomics
+ - original|mmlu:high_school_mathematics
+ - original|mmlu:high_school_microeconomics
+ - original|mmlu:high_school_physics
+ - original|mmlu:high_school_psychology
+ - original|mmlu:high_school_statistics
+ - original|mmlu:high_school_us_history
+ - original|mmlu:high_school_world_history
+ - original|mmlu:human_aging
+ - original|mmlu:human_sexuality
+ - original|mmlu:international_law
+ - original|mmlu:jurisprudence
+ - original|mmlu:logical_fallacies
+ - original|mmlu:machine_learning
+ - original|mmlu:management
+ - original|mmlu:marketing
+ - original|mmlu:medical_genetics
+ - original|mmlu:miscellaneous
+ - original|mmlu:moral_disputes
+ - original|mmlu:moral_scenarios
+ - original|mmlu:nutrition
+ - original|mmlu:philosophy
+ - original|mmlu:prehistory
+ - original|mmlu:professional_accounting
+ - original|mmlu:professional_law
+ - original|mmlu:professional_medicine
+ - original|mmlu:professional_psychology
+ - original|mmlu:public_relations
+ - original|mmlu:security_studies
+ - original|mmlu:sociology
+ - original|mmlu:us_foreign_policy
+ - original|mmlu:virology
+ - original|mmlu:world_religions
diff --git a/docs/source/contributing-to-multilingual-evaluations.mdx b/docs/source/contributing-to-multilingual-evaluations.mdx
new file mode 100644
index 000000000..25779bc38
--- /dev/null
+++ b/docs/source/contributing-to-multilingual-evaluations.mdx
@@ -0,0 +1,107 @@
+# Contributing to multilingual evaluations
+
+## Contributing a small translation
+
+We define 19 `literals`: basic keywords and punctuation signs used when automatically building evaluation prompts, such as `yes`, `no`, `because`, etc.
+
+We welcome translations in your language!
+
+To contribute, you'll need to:
+1. Open the [translation_literals](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/templates/utils/translation_literals.py) file
+2. Edit the file to add or expand the literals for your language of interest (see the reference English entry and the example sketch below).
+
+```python
+ Language.ENGLISH: TranslationLiterals(
+ language=Language.ENGLISH,
+ question_word="question", # Usage: "Question: How are you?"
+ answer="answer", # Usage: "Answer: I am fine"
+ confirmation_word="right", # Usage: "He is smart, right?"
+ yes="yes", # Usage: "Yes, he is"
+ no="no", # Usage: "No, he is not"
+ also="also", # Usage: "Also, she is smart."
+ cause_word="because", # Usage: "She is smart, because she is tall"
+ effect_word="therefore", # Usage: "He is tall therefore he is smart"
+ or_word="or", # Usage: "He is tall or small"
+ true="true", # Usage: "He is smart, true, false or neither?"
+ false="false", # Usage: "He is smart, true, false or neither?"
+ neither="neither", # Usage: "He is smart, true, false or neither?"
+ # Punctuation and spacing: only adjust if your language uses something different than in English
+ full_stop=".",
+ comma=",",
+ question_mark="?",
+ exclamation_mark="!",
+ word_space=" ",
+ sentence_space=" ",
+ colon=":",
+ # The first characters of your alphabet used in enumerations, if different from English
+ indices=["A", "B", "C", ...]
+ )
+```
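+
+For example, a new entry for French could look like the following partial sketch (the `Language.FRENCH` member name and the exact field values are illustrative assumptions; mirror the English reference entry above and check whether an entry for your language already exists before adding one):
+
+```python
+    Language.FRENCH: TranslationLiterals(
+        language=Language.FRENCH,
+        question_word="question",          # Usage: "Question : Comment ça va ?"
+        answer="réponse",                  # Usage: "Réponse : Je vais bien"
+        confirmation_word="n'est-ce pas",  # Usage: "Il est intelligent, n'est-ce pas ?"
+        yes="oui",
+        no="non",
+        also="aussi",
+        cause_word="parce que",
+        effect_word="donc",
+        or_word="ou",
+        true="vrai",
+        false="faux",
+        neither="aucun des deux",
+        # French typography puts a space before some punctuation marks
+        question_mark=" ?",
+        exclamation_mark=" !",
+        colon=" :",
+        indices=["A", "B", "C"],
+    )
+```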
+
+3. Open a PR with your modifications! And voilà!
+
+## Contributing a new multilingual task
+
+You should first read our guide on [adding a custom task](adding-a-custom-task), to better understand the different parameters we use.
+
+Then, take a look at the current [multilingual tasks](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/multilingual/tasks.py) file to understand how they are defined. For multilingual evaluations, the `prompt_function` should be built from a language-adapted template. The template takes care of correct formatting, as well as correct and consistent usage of language-adjusted prompt anchors (e.g. Question/Answer) and punctuation.
+
+Browse the list of all templates [here](https://github.com/huggingface/lighteval/tree/main/src/lighteval/tasks/templates) to see which ones best fit your task.
+
+Then, when ready to define your own task, you should:
+1. create a Python file as indicated in the above guide
+2. import the relevant templates for your task type (XNLI, COPA, multiple choice, question answering, etc.)
+3. define one task (or a list of tasks) for each relevant language and evaluation formulation (for multichoice) using our parametrizable `LightevalTaskConfig` class
+
+```python
+your_tasks = [
+ LightevalTaskConfig(
+ # Name of your evaluation
+ name=f"evalname_{language.value}_{formulation.name.lower()}",
+ # The evaluation is community contributed
+ suite=["community"],
+ # This will automatically get the correct metrics for your chosen formulation
+ metric=get_metrics_for_formulation(
+ formulation,
+ [
+ loglikelihood_acc_metric(normalization=None),
+ loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
+ loglikelihood_acc_metric(normalization=LogProbCharNorm()),
+ ],
+ ),
+ # In this function, you choose which template to follow and for which language and formulation
+ prompt_function=get_template_prompt_function(
+ language=language,
+ # then use the adapter to define the mapping between the
+ # keys of the template (left), and the keys of your dataset
+ # (right)
+ # To know which template keys are required and available,
+ # consult the appropriate adapter type and doc-string.
+ adapter=lambda line: {
+ "key": line["relevant_key"],
+ ...
+ },
+ formulation=formulation,
+ ),
+ # You can also add specific filters to remove irrelevant samples
+        hf_filter=lambda line: line["label"] in [...],  # fill in the labels to keep
+ # You then select your huggingface dataset as well as
+ # the splits available for evaluation
+        hf_repo=...,  # fill in your dataset repository
+        hf_subset=...,  # fill in your dataset subset
+ evaluation_splits=["train"],
+ hf_avail_splits=["train"],
+ )
+ for language in [
+ Language.YOUR_LANGUAGE, ...
+ ]
+ for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
+```
+4. Then, go back to the guide to test whether your task is correctly implemented!
+
+> [!TIP]
+> All `LightevalTaskConfig` parameters are strongly typed, including the inputs to the template function. Make sure to take advantage of your IDE's functionality to make it easier to correctly fill these parameters.
+
+
+Once everything is good, open a PR, and we'll be happy to review it!
\ No newline at end of file
diff --git a/docs/source/evaluate-the-model-on-a-server-or-container.mdx b/docs/source/evaluate-the-model-on-a-server-or-container.mdx
new file mode 100644
index 000000000..da8f1d4b7
--- /dev/null
+++ b/docs/source/evaluate-the-model-on-a-server-or-container.mdx
@@ -0,0 +1,67 @@
+# Evaluate the model on a server or container
+
+An alternative to launching the evaluation locally is to serve the model on a
+TGI-compatible server/container and then run the evaluation by sending requests
+to the server. The command is the same as before, except you specify a path to
+a yaml config file (detailed below):
+
+```bash
+lighteval accelerate \
+ --model_config_path="/path/to/config/file"\
+ --tasks \
+ --output_dir output_dir
+```
+
+There are two types of configuration files that can be provided for running on
+the server:
+
+### Hugging Face Inference Endpoints
+
+To launch a model using Hugging Face's Inference Endpoints, you need to provide
+a configuration file such as `endpoint_model.yaml`. Lighteval will automatically deploy
+the endpoint, run the evaluation, and finally delete the endpoint (unless you
+specify an endpoint that was already launched, in which case the endpoint won't
+be deleted afterwards).
+
+__configuration file example:__
+
+```yaml
+model:
+ type: "endpoint"
+ base_params:
+ endpoint_name: "llama-2-7B-lighteval" # needs to be lower case without special characters
+ model: "meta-llama/Llama-2-7b-hf"
+ revision: "main"
+ dtype: "float16" # can be any of "awq", "eetq", "gptq", "4bit' or "8bit" (will use bitsandbytes), "bfloat16" or "float16"
+ reuse_existing: false # if true, ignore all params in instance, and don't delete the endpoint after evaluation
+ instance:
+ accelerator: "gpu"
+ region: "eu-west-1"
+ vendor: "aws"
+ instance_size: "medium"
+ instance_type: "g5.2xlarge"
+ framework: "pytorch"
+ endpoint_type: "protected"
+    namespace: null # The namespace under which to launch the endpoint. Defaults to the current user's namespace
+    image_url: null # Optionally specify the Docker image to use when launching the endpoint model, e.g. a more recent release of the TGI container with support for newer models.
+ env_vars:
+ null # Optional environment variables to include when launching the endpoint. e.g., `MAX_INPUT_LENGTH: 2048`
+ generation:
+ add_special_tokens: true
+```
+
+### Text Generation Inference (TGI)
+
+Use this configuration to evaluate a model already deployed on a TGI server, for
+example one running on Hugging Face's serverless inference.
+
+__configuration file example:__
+
+```yaml
+model:
+ type: "tgi"
+ instance:
+ inference_server_address: ""
+ inference_server_auth: null
+ model_id: null # Optional, only required if the TGI container was launched with model_id pointing to a local directory
+```
diff --git a/docs/source/index.mdx b/docs/source/index.mdx
new file mode 100644
index 000000000..9c055f5e4
--- /dev/null
+++ b/docs/source/index.mdx
@@ -0,0 +1,18 @@
+# Lighteval
+
+🤗 Lighteval is your all-in-one toolkit for evaluating LLMs across multiple
+backends—whether it's
+[transformers](https://github.com/huggingface/transformers),
+[tgi](https://github.com/huggingface/text-generation-inference),
+[vllm](https://github.com/vllm-project/vllm), or
+[nanotron](https://github.com/huggingface/nanotron)—with
+ease. Dive deep into your model’s performance by saving and exploring detailed,
+sample-by-sample results to debug and see how your models stack up.
+
+Customization at your fingertips: effortlessly create [new
+tasks](adding-a-custom-task) and
+[metrics](adding-a-new-metric)
+tailored to your needs, or browse all our existing tasks and metrics.
+
+Seamlessly experiment, benchmark, and store your results on the Hugging Face
+Hub, S3, or locally.
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
new file mode 100644
index 000000000..39ac2b897
--- /dev/null
+++ b/docs/source/installation.mdx
@@ -0,0 +1,46 @@
+# Installation
+
+You can install Lighteval either from PyPI or from source.
+
+## From PyPI
+
+```bash
+pip install lighteval
+```
+
+## From source
+Installing from source is mostly for people who intend to develop `lighteval` itself.
+
+```bash
+git clone https://github.com/huggingface/lighteval.git
+cd lighteval
+pip install -e .
+```
+
+## Extras
+
+Lighteval has optional dependencies that you can install by specifying the
+appropriate extras group, e.g. `pip install lighteval[accelerate]` or
+`pip install -e .[accelerate]`.
+
+| extra name | description |
+|--------------|---------------------------------------------------------------------------|
+| accelerate | To use accelerate for model and data parallelism with transformers models |
+| tgi | To use Text Generation Inference API to evaluate your model |
+| nanotron | To evaluate nanotron models |
+| quantization | To evaluate quantized models |
+| adapters | To evaluate adapter models (delta and PEFT) |
+| tensorboardX | To upload your results to TensorBoard |
+| vllm | To use vllm as backend for inference |
+| s3 | To upload results to S3 |
+
+
+## Hugging Face login
+
+If you want to push your results to the Hugging Face Hub or evaluate your own
+private models, don't forget to add your access token to the environment
+variable `HF_TOKEN`. You can do this by running:
+
+```bash
+huggingface-cli login
+```
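+
+Alternatively, you can log in from Python (a small sketch; it assumes the `huggingface_hub` package is available in your environment, which is the case once lighteval's dependencies are installed):
+
+```python
+import os
+
+from huggingface_hub import login
+
+# Uses the token from the HF_TOKEN environment variable if it is set,
+# otherwise prompts for it interactively.
+login(token=os.environ.get("HF_TOKEN"))
+```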
diff --git a/docs/source/metric-list.mdx b/docs/source/metric-list.mdx
new file mode 100644
index 000000000..0ab03afb9
--- /dev/null
+++ b/docs/source/metric-list.mdx
@@ -0,0 +1,76 @@
+# Metric List
+
+## Automatic metrics for multiple choice tasks
+
+These metrics use log-likelihood of the different possible targets.
+- `loglikelihood_acc`: Fraction of instances where the choice with the best logprob was correct - also exists in a faster version for tasks where the possible choices include only one token (`loglikelihood_acc_single_token`)
+- `loglikelihood_acc_norm`: Fraction of instances where the choice with the best logprob, normalized by sequence length, was correct - also exists in a faster version for tasks where the possible choices include only one token (`loglikelihood_acc_norm_single_token`)
+- `loglikelihood_acc_norm_nospace`: Fraction of instances where the choice with the best logprob, normalized by sequence length, was correct, with the first space ignored
+- `loglikelihood_f1`: Corpus level F1 score of the multichoice selection - also exists in a faster version for tasks where the possible choices include only one token (`loglikelihood_f1_single_token`)
+- `mcc`: Matthews correlation coefficient (a measure of agreement between statistical distributions).
+- `recall_at_1`: Fraction of instances where the choice with the best logprob was correct - also exists in a faster version for tasks where the possible choices include only one token per choice (`recall_at_1_single_token`)
+- `recall_at_2`: Fraction of instances where the choice with the 2nd best logprob or better was correct - also exists in a faster version for tasks where the possible choices include only one token per choice (`recall_at_2_single_token`)
+- `mrr`: Mean reciprocal rank, a measure of the quality of a ranking of choices ordered by correctness/relevance - also exists in a faster version for tasks where the possible choices include only one token (`mrr_single_token`)
+- `target_perplexity`: Perplexity of the different choices available.
+- `acc_golds_likelihood`: A bit different; it checks whether the average logprob of a single target is above or below 0.5
+- `multi_f1_numeric`: Loglikelihood F1 score for multiple gold targets
+
+All these metrics also exist in a "single token" version (`loglikelihood_acc_single_token`, `loglikelihood_acc_norm_single_token`, `loglikelihood_f1_single_token`, `mcc_single_token`, `recall_at_2_single_token` and `mrr_single_token`). When the multichoice option compares only one token (e.g. "A" vs "B" vs "C" vs "D", or "yes" vs "no"), using the single token version of these metrics divides the time spent by the number of choices. Single token evals also include:
+- `multi_f1_numeric`: computes the f1 score of all possible choices and averages it.
+
+## Automatic metrics for perplexity and language modeling
+These metrics use the log-likelihood of the prompt.
+- `word_perplexity`: Perplexity (log probability of the input) weighted by the number of words of the sequence.
+- `byte_perplexity`: Perplexity (log probability of the input) weighted by the number of bytes of the sequence.
+- `bits_per_byte`: Average number of bits per byte according to model probabilities.
+- `log_prob`: Predicted output's average log probability (input's log prob for language modeling).
+
+## Automatic metrics for generative tasks
+These metrics need the model to generate an output. They are therefore slower.
+- Base:
+ - `perfect_exact_match`: Fraction of instances where the prediction matches the gold exactly.
+ - `exact_match`: Fraction of instances where the prediction matches the gold with the exception of the border whitespaces (= after a `strip` has been applied to both).
+ - `quasi_exact_match`: Fraction of instances where the normalized prediction matches the normalized gold (normalization done on whitespace, articles, capitalization, ...). Other variations exist, with other normalizers, such as `quasi_exact_match_triviaqa`, which only normalizes the predictions after applying a strip to all sentences.
+  - `prefix_exact_match`: Fraction of instances where the beginning of the prediction matches the gold with the exception of the border whitespaces (= after a `strip` has been applied to both).
+ - `prefix_quasi_exact_match`: Fraction of instances where the normalized beginning of the prediction matches the normalized gold (normalization done on whitespace, articles, capitalization, ...)
+ - `exact_match_indicator`: Exact match with some preceding context (before an indicator) removed
+ - `f1_score_quasi`: Average F1 score in terms of word overlap between the model output and gold, with both being normalized first
+ - `f1_score`: Average F1 score in terms of word overlap between the model output and gold without normalisation
+ - `f1_score_macro`: Corpus level macro F1 score
+  - `f1_score_micro`: Corpus level micro F1 score
+ - `maj_at_5` and `maj_at_8`: Model majority vote. Takes n (5 or 8) generations from the model and assumes the most frequent is the actual prediction.
+- Summarization:
+ - `rouge`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/)
+ - `rouge1`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
+ - `rouge2`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
+ - `rougeL`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
+ - `rougeLsum`: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
+ - `rouge_t5` (BigBench): Corpus level ROUGE score for all available ROUGE metrics
+ - `faithfulness`: Faithfulness scores based on the SummaC method of [Laban et al. (2022)](https://aclanthology.org/2022.tacl-1.10/).
+  - `extractiveness`: Reports the following statistics, based on [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/):
+    - `summarization_coverage`: Extent to which the model-generated summaries are extractive fragments from the source document.
+    - `summarization_density`: Extent to which the model-generated summaries are extractive summaries based on the source document.
+    - `summarization_compression`: Extent to which the model-generated summaries are compressed relative to the source document.
+ - `bert_score`: Reports the average BERTScore precision, recall, and f1 score [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and gold summary.
+- Translation:
+ - `bleu`: Corpus level BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) - uses the sacrebleu implementation.
+ - `bleu_1`: Average sample BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 1-gram overlap - uses the nltk implementation.
+ - `bleu_4`: Average sample BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap - uses the nltk implementation.
+ - `chrf`: Character n-gram matches f-score.
+ - `ter`: Translation edit/error rate.
+- Copyright:
+ - `copyright`: Reports:
+ - `longest_common_prefix_length`: average length of longest common prefix between model generation and reference,
+ - `edit_distance`: average Levenshtein edit distance between model generation and reference,
+ - `edit_similarity`: average Levenshtein edit similarity (normalized by length of longer sequence) between model generation and reference.
+- Math:
+ - `quasi_exact_match_math`: Fraction of instances where the normalized prediction matches the normalized gold (normalization done for math, where latex symbols, units, etc are removed)
+ - `maj_at_4_math`: Majority choice evaluation, using the math normalisation for the predictions and gold
+ - `quasi_exact_match_gsm8k`: Fraction of instances where the normalized prediction matches the normalized gold (normalization done for gsm8k, where latex symbols, units, etc are removed)
+ - `maj_at_8_gsm8k`: Majority choice evaluation, using the gsm8k normalisation for the predictions and gold
+
+## LLM-as-Judge
+- `llm_judge_gpt3p5`: Can be used for any generative task; the model will be scored by a GPT-3.5 model using the OpenAI API.
+- `llm_judge_llama_3_405b`: Can be used for any generative task; the model will be scored by a Llama 3 405B model using the Hugging Face API.
+- `llm_judge_multi_turn_gpt3p5`: Can be used for any generative task; the model will be scored by a GPT-3.5 model using the OpenAI API. It is used for multi-turn tasks like MT-Bench.
+- `llm_judge_multi_turn_llama_3_405b`: Can be used for any generative task; the model will be scored by a Llama 3 405B model using the Hugging Face API. It is used for multi-turn tasks like MT-Bench.
diff --git a/docs/source/quicktour.mdx b/docs/source/quicktour.mdx
new file mode 100644
index 000000000..5f66547e9
--- /dev/null
+++ b/docs/source/quicktour.mdx
@@ -0,0 +1,160 @@
+# Quicktour
+
+We provide two main entry points to evaluate models:
+
+- `lighteval accelerate`: evaluate models on CPU or one or more GPUs using [🤗
+ Accelerate](https://github.com/huggingface/accelerate)
+- `lighteval nanotron`: evaluate models in distributed settings using [⚡️
+ Nanotron](https://github.com/huggingface/nanotron)
+
+## Accelerate
+
+### Evaluate a model on a GPU
+
+To evaluate `GPT-2` on the Truthful QA benchmark, run:
+
+```bash
+lighteval accelerate \
+ --model_args "pretrained=gpt2" \
+ --tasks "leaderboard|truthfulqa:mc|0|0" \
+ --override_batch_size 1 \
+ --output_dir="./evals/"
+```
+
+Here, `--tasks` accepts either a comma-separated list of supported tasks from
+the [tasks list](available-tasks), each specified in the format:
+
+```bash
+{suite}|{task}|{num_few_shot}|{0 or 1 to automatically reduce `num_few_shot` if prompt is too long}
+```
+
+or a file path like
+[examples/tasks/recommended_set.txt](https://github.com/huggingface/lighteval/blob/main/examples/tasks/recommended_set.txt)
+which specifies multiple task configurations.
+
+Tasks details can be found in the
+[file](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/default_tasks.py)
+implementing them.
+
+### Evaluate a model on one or more GPUs
+
+#### Data parallelism
+
+To evaluate a model on one or more GPUs, first create a multi-GPU config by running:
+
+```bash
+accelerate config
+```
+
+You can then evaluate a model using data parallelism on 8 GPUs as follows:
+
+```bash
+accelerate launch --multi_gpu --num_processes=8 -m \
+ lighteval accelerate \
+ --model_args "pretrained=gpt2" \
+ --tasks "leaderboard|truthfulqa:mc|0|0" \
+ --override_batch_size 1 \
+ --output_dir="./evals/"
+```
+
+Here, `--override_batch_size` defines the batch size per device, so the effective
+batch size will be `override_batch_size * num_gpus`.
+
+#### Pipeline parallelism
+
+To evaluate a model using pipeline parallelism on 2 or more GPUs, run:
+
+```bash
+lighteval accelerate \
+ --model_args "pretrained=gpt2,model_parallel=True" \
+ --tasks "leaderboard|truthfulqa:mc|0|0" \
+ --override_batch_size 1 \
+ --output_dir="./evals/"
+```
+
+This will automatically use accelerate to distribute the model across the GPUs.
+
+> [!TIP]
+> Both data and pipeline parallelism can be combined by setting
+> `model_parallel=True` and using accelerate to distribute the data across the
+GPUs.
+
+### Model Arguments
+
+The `--model_args` argument takes a comma-separated string of model arguments in
+`key=value` form, e.g. `pretrained=gpt2,dtype=float16`. The allowed arguments vary
+depending on the backend you use (vllm or accelerate).
+
+#### Accelerate
+
+- **pretrained** (str):
+ HuggingFace Hub model ID name or the path to a pre-trained
+ model to load. This is effectively the `pretrained_model_name_or_path`
+ argument of `from_pretrained` in the HuggingFace `transformers` API.
+- **tokenizer** (Optional[str]): HuggingFace Hub tokenizer ID that will be
+ used for tokenization.
+- **multichoice_continuations_start_space** (Optional[bool]): Whether to add a
+ space at the start of each continuation in multichoice generation.
+  For example, with the context "What is the capital of France?" and the choices "Paris" and "London",
+  the inputs are tokenized as "What is the capital of France? Paris" and "What is the capital of France? London".
+  `True` adds a space, `False` strips a space, and `None` does nothing.
+- **subfolder** (Optional[str]): The subfolder within the model repository.
+- **revision** (str): The revision of the model.
+- **max_gen_toks** (Optional[int]): The maximum number of tokens to generate.
+- **max_length** (Optional[int]): The maximum length of the generated output.
+- **add_special_tokens** (bool, optional, defaults to True): Whether to add special tokens to the input sequences.
+ If `None`, the default value will be set to `True` for seq2seq models (e.g. T5) and
+ `False` for causal models.
+- **model_parallel** (bool, optional, defaults to None):
+  Whether to force the use of the `accelerate` library to load a large
+  model across multiple devices.
+  Defaults to None, which compares the number of processes with the number of
+  GPUs: if the number of processes is smaller, model parallelism is used; otherwise it is not.
+- **dtype** (Union[str, torch.dtype], optional, defaults to None):
+ Converts the model weights to `dtype`, if specified. Strings get
+ converted to `torch.dtype` objects (e.g. `float16` -> `torch.float16`).
+ Use `dtype="auto"` to derive the type from the model's weights.
+- **device** (Union[int, str]): device to use for running the model.
+- **quantization_config** (Optional[BitsAndBytesConfig]): quantization
+ configuration for the model, manually provided to load a normally floating point
+ model at a quantized precision. Needed for 4-bit and 8-bit precision.
+- **trust_remote_code** (bool): Whether to trust remote code during model
+ loading.
+
+#### VLLM
+
+- **pretrained** (str): HuggingFace Hub model ID name or the path to a pre-trained model to load.
+- **gpu_memory_utilisation** (float): The fraction of GPU memory to use.
+- **batch_size** (int): The batch size used for evaluation.
+- **revision** (str): The revision of the model.
+- **dtype** (str, None): The data type to use for the model.
+- **tensor_parallel_size** (int): The number of tensor parallel units to use.
+- **data_parallel_size** (int): The number of data parallel units to use.
+- **max_model_length** (int): The maximum length of the model.
+- **swap_space** (int): The CPU swap space size (GiB) per GPU.
+- **seed** (int): The seed to use for the model.
+- **trust_remote_code** (bool): Whether to trust remote code during model loading.
+- **use_chat_template** (bool): Whether to use the chat template or not.
+- **add_special_tokens** (bool): Whether to add special tokens to the input sequences.
+- **multichoice_continuations_start_space** (bool): Whether to add a space at the start of each continuation in multichoice generation.
+- **subfolder** (Optional[str]): The subfolder within the model repository.
+
+## Nanotron
+
+To evaluate a model trained with nanotron on a single GPU:
+
+> [!WARNING]
+> Nanotron models cannot be evaluated without torchrun.
+
+
+```bash
+torchrun --standalone --nnodes=1 --nproc-per-node=1 \
+    src/lighteval/__main__.py nanotron \
+    --checkpoint_config_path ../nanotron/checkpoints/10/config.yaml \
+    --lighteval_config_path examples/nanotron/lighteval_config_override_template.yaml
+```
+
+The `nproc-per-node` argument should match the data, tensor and pipeline
+parallelism configured in the `lighteval_config_override_template.yaml` file.
+That is: `nproc-per-node = data_parallelism * tensor_parallelism *
+pipeline_parallelism`.
diff --git a/docs/source/saving-and-reading-results.mdx b/docs/source/saving-and-reading-results.mdx
new file mode 100644
index 000000000..b50cdee6c
--- /dev/null
+++ b/docs/source/saving-and-reading-results.mdx
@@ -0,0 +1,214 @@
+# Saving and reading results
+
+## Saving results locally
+
+Lighteval will automatically save results and evaluation details in the
+directory set with the `--output_dir` argument. The results will be saved in
+`{output_dir}/results/{model_name}/results_{timestamp}.json`. [Here is an
+example of a result file](#example-of-a-result-file). The output path can be
+any [fsspec](https://filesystem-spec.readthedocs.io/en/latest/index.html)
+compliant path (local, s3, hf hub, gdrive, ftp, etc).
+
+To save the details of the evaluation, you can use the `--save_details`
+argument. The details will be saved in a parquet file
+`{output_dir}/details/{model_name}/{timestamp}/details_{task}_{timestamp}.parquet`.
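+
+As a quick sanity check, you can locate and load the most recent results file with the standard library alone (a minimal sketch; the glob pattern follows the layout described above, and `output_dir` and `model_name` are placeholders to adapt):
+
+```python
+import glob
+import json
+
+output_dir = "evals_doc"
+model_name = "gpt2"
+
+# Results are saved as {output_dir}/results/{model_name}/results_{timestamp}.json
+result_files = sorted(glob.glob(f"{output_dir}/results/{model_name}/results_*.json"))
+
+with open(result_files[-1]) as f:
+    results = json.load(f)
+
+print(results["results"])  # aggregated metrics per task
+```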
+
+## Pushing results to the Hugging Face Hub
+
+You can push the results and evaluation details to the Hugging Face Hub. To do
+so, you need to set the `--push_to_hub` and `--results_org` arguments. The
+results will be saved in a dataset named `{results_org}/{model_org}/{model_name}`.
+To push the details, you also need to set the `--save_details` argument.
+The dataset created is private by default; you can make it public by setting
+the `--public_run` argument.
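+
+If you drive lighteval from Python rather than from the CLI, the same behaviour is configured on the `EvaluationTracker` (see [Using the Python API](using-the-python-api) for the full pipeline); a minimal sketch, where `hub_results_org` is assumed to play the role of `--results_org`:
+
+```python
+from lighteval.logging.evaluation_tracker import EvaluationTracker
+
+evaluation_tracker = EvaluationTracker(
+    output_dir="./results",
+    save_details=True,   # also save (and push) the sample-by-sample details
+    push_to_hub=True,    # push results to the Hugging Face Hub
+    hub_results_org="your_org",  # organization (or username) to push results to
+)
+```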
+
+
+## Pushing results to TensorBoard
+
+You can push the results to TensorBoard by setting `--push_to_tensorboard`.
+
+
+## How to load and investigate details
+
+### Load from local detail files
+
+```python
+import glob
+
+from datasets import load_dataset
+
+output_dir = "evals_doc"
+model_name = "HuggingFaceH4/zephyr-7b-beta"
+timestamp = "latest"
+task = "lighteval|gsm8k|0"
+
+if timestamp == "latest":
+ path = f"{output_dir}/details/{model_org}/{model_name}/*/"
+ timestamps = glob.glob(path)
+ timestamp = sorted(timestamps)[-1].split("/")[-2]
+ print(f"Latest timestamp: {timestamp}")
+
+details_path = f"{output_dir}/details/{model_name}/{timestamp}/details_{task}_{timestamp}.parquet"
+
+# Load the details
+details = load_dataset("parquet", data_files=details_path, split="train")
+
+for detail in details:
+ print(detail)
+```
+
+### Load from the HuggingFace hub
+
+```python
+from datasets import load_dataset
+
+results_org = "SaylorTwift"
+model_name = "HuggingFaceH4/zephyr-7b-beta"
+sanitized_model_name = model_name.replace("/", "__")
+task = "lighteval|gsm8k|0"
+public_run = False
+
+dataset_path = f"{results_org}/details_{sanitized_model_name}{'_private' if not public_run else ''}"
+details = load_dataset(dataset_path, task.replace("|", "_"), split="latest")
+
+for detail in details:
+ print(detail)
+```
+
+
+The detail file contains the following columns (see the inspection example after the list):
+- `choices`: The choices presented to the model in the case of multichoice tasks.
+- `gold`: The gold answer.
+- `gold_index`: The index of the gold answer in the choices list.
+- `cont_tokens`: The continuation tokens.
+- `example`: The input in text form.
+- `full_prompt`: The full prompt that is fed to the model.
+- `input_tokens`: The tokens of the full prompt.
+- `instruction`: The instruction given to the model.
+- `metrics`: The metrics computed for the example.
+- `num_asked_few_shots`: The number of few shots asked to the model.
+- `num_effective_few_shots`: The number of effective few shots.
+- `padded`: Whether the input was padded.
+- `pred_logits`: The logits of the model.
+- `predictions`: The predictions of the model.
+- `specifics`: The specifics of the task.
+- `truncated`: Whether the input was truncated.
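+
+Once loaded with one of the snippets above, the details can be inspected column by column, for example by converting them to a pandas DataFrame (this assumes `pandas` is installed and reuses the `details` dataset from the previous examples):
+
+```python
+# Convert the details dataset to a pandas DataFrame for easier inspection
+df = details.to_pandas()
+
+# Look at what the model saw and produced for the first few samples
+print(df[["full_prompt", "predictions", "metrics"]].head())
+```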
+
+
+## Example of a result file
+
+```json
+{
+ "config_general": {
+ "lighteval_sha": "203045a8431bc9b77245c9998e05fc54509ea07f",
+ "num_fewshot_seeds": 1,
+ "override_batch_size": 1,
+ "max_samples": 1,
+ "job_id": "",
+ "start_time": 620979.879320166,
+ "end_time": 621004.632108041,
+ "total_evaluation_time_secondes": "24.752787875011563",
+ "model_name": "gpt2",
+ "model_sha": "607a30d783dfa663caf39e06633721c8d4cfcd7e",
+ "model_dtype": null,
+ "model_size": "476.2 MB"
+ },
+ "results": {
+ "lighteval|gsm8k|0": {
+ "qem": 0.0,
+ "qem_stderr": 0.0,
+ "maj@8": 0.0,
+ "maj@8_stderr": 0.0
+ },
+ "all": {
+ "qem": 0.0,
+ "qem_stderr": 0.0,
+ "maj@8": 0.0,
+ "maj@8_stderr": 0.0
+ }
+ },
+ "versions": {
+ "lighteval|gsm8k|0": 0
+ },
+ "config_tasks": {
+ "lighteval|gsm8k": {
+ "name": "gsm8k",
+ "prompt_function": "gsm8k",
+ "hf_repo": "gsm8k",
+ "hf_subset": "main",
+ "metric": [
+ {
+ "metric_name": "qem",
+ "higher_is_better": true,
+ "category": "3",
+ "use_case": "5",
+ "sample_level_fn": "compute",
+ "corpus_level_fn": "mean"
+ },
+ {
+ "metric_name": "maj@8",
+ "higher_is_better": true,
+ "category": "5",
+ "use_case": "5",
+ "sample_level_fn": "compute",
+ "corpus_level_fn": "mean"
+ }
+ ],
+ "hf_avail_splits": [
+ "train",
+ "test"
+ ],
+ "evaluation_splits": [
+ "test"
+ ],
+ "few_shots_split": null,
+ "few_shots_select": "random_sampling_from_train",
+ "generation_size": 256,
+ "generation_grammar": null,
+ "stop_sequence": [
+ "Question="
+ ],
+ "output_regex": null,
+ "num_samples": null,
+ "frozen": false,
+ "suite": [
+ "lighteval"
+ ],
+ "original_num_docs": 1319,
+ "effective_num_docs": 1,
+ "trust_dataset": true,
+ "must_remove_duplicate_docs": null,
+ "version": 0
+ }
+ },
+ "summary_tasks": {
+ "lighteval|gsm8k|0": {
+ "hashes": {
+ "hash_examples": "8517d5bf7e880086",
+ "hash_full_prompts": "8517d5bf7e880086",
+ "hash_input_tokens": "29916e7afe5cb51d",
+ "hash_cont_tokens": "37f91ce23ef6d435"
+ },
+ "truncated": 2,
+ "non_truncated": 0,
+ "padded": 0,
+ "non_padded": 2,
+ "effective_few_shots": 0.0,
+ "num_truncated_few_shots": 0
+ }
+ },
+ "summary_general": {
+ "hashes": {
+ "hash_examples": "5f383c395f01096e",
+ "hash_full_prompts": "5f383c395f01096e",
+ "hash_input_tokens": "ac933feb14f96d7b",
+ "hash_cont_tokens": "9d03fb26f8da7277"
+ },
+ "truncated": 2,
+ "non_truncated": 0,
+ "padded": 0,
+ "non_padded": 2,
+ "num_truncated_few_shots": 0
+ }
+}
+```
diff --git a/docs/source/use-vllm-as-backend.mdx b/docs/source/use-vllm-as-backend.mdx
new file mode 100644
index 000000000..153ff659f
--- /dev/null
+++ b/docs/source/use-vllm-as-backend.mdx
@@ -0,0 +1,53 @@
+# Use VLLM as backend
+
+Lighteval allows you to use `vllm` as a backend, which can provide significant speedups.
+To use it, simply change the `model_args` to reflect the arguments you want to pass to vllm.
+
+```bash
+lighteval accelerate \
+ --model_args="vllm,pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16" \
+ --tasks "leaderboard|truthfulqa:mc|0|0" \
+ --output_dir="./evals/"
+```
+
+`vllm` is able to distribute the model across multiple GPUs using data
+parallelism, pipeline parallelism or tensor parallelism.
+You can choose the parallelism method by setting it in the `model_args`.
+
+For example, if you have 4 GPUs, you can split the model across them using tensor parallelism (`tensor_parallel_size`):
+
+```bash
+export VLLM_WORKER_MULTIPROC_METHOD=spawn && lighteval accelerate \
+ --model_args="vllm,pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16,tensor_parallel_size=4" \
+ --tasks "leaderboard|truthfulqa:mc|0|0" \
+ --output_dir="./evals/"
+```
+
+Or, if your model fits on a single GPU, you can use data parallelism (`data_parallel_size`) to speed up the evaluation:
+
+```bash
+lighteval accelerate \
+ --model_args="vllm,pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16,data_parallel_size=4" \
+ --tasks "leaderboard|truthfulqa:mc|0|0" \
+ --output_dir="./evals/"
+```
+
+Available arguments for `vllm` can be found in the `VLLMModelConfig`:
+
+- **pretrained** (str): HuggingFace Hub model ID name or the path to a pre-trained model to load.
+- **gpu_memory_utilisation** (float): The fraction of GPU memory to use.
+- **revision** (str): The revision of the model.
+- **dtype** (str, None): The data type to use for the model.
+- **tensor_parallel_size** (int): The number of tensor parallel units to use.
+- **data_parallel_size** (int): The number of data parallel units to use.
+- **max_model_length** (int): The maximum length of the model.
+- **swap_space** (int): The CPU swap space size (GiB) per GPU.
+- **seed** (int): The seed to use for the model.
+- **trust_remote_code** (bool): Whether to trust remote code during model loading.
+- **add_special_tokens** (bool): Whether to add special tokens to the input sequences.
+- **multichoice_continuations_start_space** (bool): Whether to add a space at the start of each continuation in multichoice generation.
+
+> [!WARNING]
+> In the case of OOM issues, you might need to reduce the context size of the
+> model as well as reduce the `gpu_memory_utilisation` parameter.
+
diff --git a/docs/source/using-the-python-api.mdx b/docs/source/using-the-python-api.mdx
new file mode 100644
index 000000000..82238c7f1
--- /dev/null
+++ b/docs/source/using-the-python-api.mdx
@@ -0,0 +1,62 @@
+# Using the Python API
+
+Lighteval can be used from a custom Python script. To evaluate a model, you will
+need to set up an `evaluation_tracker`, `pipeline_parameters`, a `model_config`,
+and a `pipeline`.
+
+After that, simply run the pipeline and save the results.
+
+
+```python
+from datetime import timedelta
+
+import lighteval
+from lighteval.logging.evaluation_tracker import EvaluationTracker
+from lighteval.models.model_config import VLLMModelConfig
+from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters
+from lighteval.utils.utils import EnvConfig
+from lighteval.utils.imports import is_accelerate_available
+
+if is_accelerate_available():
+ from accelerate import Accelerator, InitProcessGroupKwargs
+ accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=3000))])
+else:
+ accelerator = None
+
+def main():
+ evaluation_tracker = EvaluationTracker(
+ output_dir="./results",
+ save_details=True,
+ push_to_hub=True,
+        hub_results_org="your user name",  # your Hugging Face username or org
+ )
+
+ pipeline_params = PipelineParameters(
+ launcher_type=ParallelismManager.ACCELERATE,
+ env_config=EnvConfig(cache_dir="tmp/"),
+ # Remove the 2 parameters below once your configuration is tested
+ override_batch_size=1,
+ max_samples=10
+ )
+
+ model_config = VLLMModelConfig(
+ pretrained="HuggingFaceH4/zephyr-7b-beta",
+ dtype="float16",
+ use_chat_template=True,
+ )
+
+ task = "helm|mmlu|5|1"
+
+ pipeline = Pipeline(
+ tasks=task,
+ pipeline_parameters=pipeline_params,
+ evaluation_tracker=evaluation_tracker,
+ model_config=model_config,
+ custom_task_directory=None, # if using a custom task
+ )
+
+ pipeline.evaluate()
+ pipeline.save_and_push_results()
+ pipeline.show_results()
+
+if __name__ == "__main__":
+ main()
+```
diff --git a/pyproject.toml b/pyproject.toml
index e736be66a..a779ebf4c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -92,6 +92,7 @@ vllm = ["vllm", "ray", "more_itertools"]
quality = ["ruff==v0.2.2","pre-commit"]
tests = ["pytest==7.4.0"]
dev = ["lighteval[accelerate,quality,tests,multilingual]"]
+docs = ["hf-doc-builder", "watchdog"]
extended_tasks = [
"langdetect", # ifeval
"openai", # llm as a judge using openai models