From 0977fde60ba3564cf34c53606a53962d81eef7b0 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 18 Dec 2024 13:47:14 +0100
Subject: [PATCH 1/7] Upgrade deprecated GH Action cache@v2 (#456)

---
 .github/workflows/tests.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 7b980508..c0b06d36 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -28,7 +28,7 @@ jobs:
        run: |
          pip install -e .[dev,extended_tasks,multilingual]
      - name: Get cached files
-       uses: actions/cache@v2
+       uses: actions/cache@v4
        id: get-cache
        with:
          path: "cache"
@@ -41,7 +41,7 @@ jobs:
        run: | # PYTHONPATH="${PYTHONPATH}:src" HF_DATASETS_CACHE="cache/datasets" HF_HOME="cache/models"
         python -m pytest --disable-pytest-warnings
      - name: Write cache
-       uses: actions/cache@v2
+       uses: actions/cache@v4
        with:
          path: "cache"
          key: test-cache-HF

From c45e3c8b96b0f5e99eab4d1c152198792be24251 Mon Sep 17 00:00:00 2001
From: Aoi <82735346+ryan-minato@users.noreply.github.com>
Date: Thu, 19 Dec 2024 21:31:26 +0900
Subject: [PATCH 2/7] fix: CACHE_DIR Default Value in Accelerate Pipeline
 (#461)

---
 src/lighteval/main_accelerate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lighteval/main_accelerate.py b/src/lighteval/main_accelerate.py
index a0a01abe..2dd78f44 100644
--- a/src/lighteval/main_accelerate.py
+++ b/src/lighteval/main_accelerate.py
@@ -31,7 +31,7 @@
 logger = logging.getLogger(__name__)
 
 TOKEN = os.getenv("HF_TOKEN")
-CACHE_DIR: str = os.getenv("HF_HOME", "/scratch")
+CACHE_DIR: str = os.getenv("HF_HOME")
 
 HELP_PANEL_NAME_1 = "Common Parameters"
 HELP_PANEL_NAME_2 = "Logging Parameters"

From 988fa94db7cdd539f2a7a4971ed67890b0da4184 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 19 Dec 2024 14:55:05 +0100
Subject: [PATCH 3/7] Add EvaluationTracker to docs and fix its docstring
 (#464)

* Fix definition of public in docstring

* Fix push_to_tensorboard param name in docstring

* Fix docstring style

* Add EvaluationTracker to docs

* Fix docstring style

* Move docstring to class header

* Add attributes to docstring

* Fix style

* Fix style

* Fix style

* Fix style

* Fix style

* Fix style

* Fix internal links in docstring
---
 docs/source/package_reference/logging.mdx   |  5 +-
 src/lighteval/logging/evaluation_tracker.py | 51 +++++++++++----------
 2 files changed, 31 insertions(+), 25 deletions(-)

diff --git a/docs/source/package_reference/logging.mdx b/docs/source/package_reference/logging.mdx
index 9fd01154..9102755c 100644
--- a/docs/source/package_reference/logging.mdx
+++ b/docs/source/package_reference/logging.mdx
@@ -1,4 +1,7 @@
-# Loggers
+# Logging
+
+## EvaluationTracker
+[[autodoc]] logging.evaluation_tracker.EvaluationTracker
 
 ## GeneralConfigLogger
 [[autodoc]] logging.info_loggers.GeneralConfigLogger
diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py
index 01705534..8cc8c09e 100644
--- a/src/lighteval/logging/evaluation_tracker.py
+++ b/src/lighteval/logging/evaluation_tracker.py
@@ -82,16 +82,35 @@ def default(self, o):
 
 
 class EvaluationTracker:
-    """
-    Keeps track of the overall evaluation process and relevant informations.
+    """Keeps track of the overall evaluation process and relevant information.
 
-    The [`EvaluationTracker`] contains specific loggers for experiments details
-    ([`DetailsLogger`]), metrics ([`MetricsLogger`]), task versions
-    ([`VersionsLogger`]) as well as for the general configurations of both the
-    specific task ([`TaskConfigLogger`]) and overall evaluation run
-    ([`GeneralConfigLogger`]).  It compiles the data from these loggers and
+    The [`~logging.evaluation_tracker.EvaluationTracker`] contains specific loggers for experiment details
+    ([`~logging.info_loggers.DetailsLogger`]), metrics ([`~logging.info_loggers.MetricsLogger`]), task versions
+    ([`~logging.info_loggers.VersionsLogger`]) as well as for the general configurations of both the
+    specific task ([`~logging.info_loggers.TaskConfigLogger`]) and overall evaluation run
+    ([`~logging.info_loggers.GeneralConfigLogger`]).  It compiles the data from these loggers and
     writes it to files, which can be published to the Hugging Face hub if
     requested.
+
+    Args:
+        output_dir (`str`): Local folder path where you want results to be saved.
+        save_details (`bool`, defaults to True): If True, details are saved to the `output_dir`.
+        push_to_hub (`bool`, defaults to False): If True, details are pushed to the hub.
+            Results are pushed to `{hub_results_org}/details__{sanitized model_name}` for the model `model_name`, a public dataset,
+            if `public` is True else `{hub_results_org}/details__{sanitized model_name}_private`, a private dataset.
+        push_to_tensorboard (`bool`, defaults to False): If True, will create and push the results for a tensorboard folder on the hub.
+        hub_results_org (`str`, *optional*): The organisation to push the results to.
+            See more details about the datasets organisation in [`EvaluationTracker.save`].
+        tensorboard_metric_prefix (`str`, defaults to "eval"): Prefix for the metrics in the tensorboard logs.
+        public (`bool`, defaults to False): If True, results and details are pushed to public orgs.
+        nanotron_run_info ([`~nanotron.config.GeneralArgs`], *optional*): Reference to information about Nanotron models runs.
+
+    **Attributes**:
+        - **details_logger** ([`~logging.info_loggers.DetailsLogger`]) -- Logger for experiment details.
+        - **metrics_logger** ([`~logging.info_loggers.MetricsLogger`]) -- Logger for experiment metrics.
+        - **versions_logger** ([`~logging.info_loggers.VersionsLogger`]) -- Logger for task versions.
+        - **general_config_logger** ([`~logging.info_loggers.GeneralConfigLogger`]) -- Logger for general configuration.
+        - **task_config_logger** ([`~logging.info_loggers.TaskConfigLogger`]) -- Logger for task configuration.
     """
 
     def __init__(
@@ -105,23 +124,7 @@ def __init__(
         public: bool = False,
         nanotron_run_info: "GeneralArgs" = None,
     ) -> None:
-        """
-        Creates all the necessary loggers for evaluation tracking.
-
-        Args:
-            output_dir (str): Local folder path where you want results to be saved
-            save_details (bool): If True, details are saved to the output_dir
-            push_to_hub (bool): If True, details are pushed to the hub.
-                Results are pushed to `{hub_results_org}/details__{sanitized model_name}` for the model `model_name`, a public dataset,
-                if `public` is True else `{hub_results_org}/details__{sanitized model_name}_private`, a private dataset.
-            push_results_to_tensorboard (bool): If True, will create and push the results for a tensorboard folder on the hub
-            hub_results_org (str): The organisation to push the results to. See
-                more details about the datasets organisation in
-                [`EvaluationTracker.save`]
-            tensorboard_metric_prefix (str): Prefix for the metrics in the tensorboard logs
-            public (bool): If True, results and details are pushed in private orgs
-            nanotron_run_info (GeneralArgs): Reference to informations about Nanotron models runs
-        """
+        """Creates all the necessary loggers for evaluation tracking."""
         self.details_logger = DetailsLogger()
         self.metrics_logger = MetricsLogger()
         self.versions_logger = VersionsLogger()

From a1c610daab3f796a273ab56f78b5f5fe9614b8aa Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 19 Dec 2024 16:03:44 +0100
Subject: [PATCH 4/7] Remove unnecessary deepcopy in evaluation_tracker (#459)

* Remove unnecessary deepcopy in evaluation_tracker

* Fix style
---
 src/lighteval/logging/evaluation_tracker.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py
index 8cc8c09e..6cad9189 100644
--- a/src/lighteval/logging/evaluation_tracker.py
+++ b/src/lighteval/logging/evaluation_tracker.py
@@ -20,7 +20,6 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-import copy
 import json
 import logging
 import os
@@ -156,8 +155,7 @@ def save(self) -> None:
         date_id = datetime.now().isoformat().replace(":", "-")
 
         # We first prepare data to save
-        config_general = copy.deepcopy(self.general_config_logger)
-        config_general = asdict(config_general)
+        config_general = asdict(self.general_config_logger)
         # We remove the config from logging, which contains context/accelerator objects
         config_general.pop("config")
 

From fbca143616c37f4336f80768cc4bdddb97bf3b06 Mon Sep 17 00:00:00 2001
From: 3 a l i <58257628+alielfilali01@users.noreply.github.com>
Date: Fri, 20 Dec 2024 22:12:34 +0400
Subject: [PATCH 5/7] Update arabic_evals.py: Fix custom arabic tasks [2nd
 attempt] (#444)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix the alghafa prompt function by building the list of choices from the `sol1`..`sol5` columns that are actually present in each line.
(Not all subsets of AlGhafa Native share the same columns.)

---------

Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>
Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com>
---
 .github/workflows/trufflehog.yml                     |  1 -
 community_tasks/arabic_evals.py                      |  7 ++-----
 docs/source/adding-a-new-metric.mdx                  |  1 -
 .../contributing-to-multilingual-evaluations.mdx     | 12 ++++++------
 docs/source/using-the-python-api.mdx                 |  2 +-
 5 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/trufflehog.yml b/.github/workflows/trufflehog.yml
index 8ac08ad6..ecdca01d 100644
--- a/.github/workflows/trufflehog.yml
+++ b/.github/workflows/trufflehog.yml
@@ -16,4 +16,3 @@ jobs:
         fetch-depth: 0
     - name: Secret Scanning
       uses: trufflesecurity/trufflehog@main
-
diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py
index 4408f22f..86ab69e2 100644
--- a/community_tasks/arabic_evals.py
+++ b/community_tasks/arabic_evals.py
@@ -86,7 +86,6 @@ def arabic_mmlu_pfn(line, task_name: str = None):
         choices=valid_keys_arabic,  # Return only valid choices (Arabic keys)
         gold_index=answer_index,  # Correct index in the valid Arabic keys
         instruction=instruction,
-        target_for_fewshot_sorting=valid_keys_arabic[answer_index],  # Correct answer in Arabic form
     )
 
 
@@ -149,7 +148,6 @@ def arabic_mmlu_ht_pfn(line, task_name: str = None):
         choices=[str(i) for i in range(1, len(choices) + 1)],  # List of strings instead of ints
         gold_index=answer_index,
         instruction=instruction,
-        target_for_fewshot_sorting=str(answer_index),  # Assuming it's sorted based on the number
     )
 
 
@@ -328,7 +326,6 @@ def aratrust_pfn(line, task_name: str = None):
         choices=LETTER_INDICES_AR[:3],
         gold_index=answer_index,
         instruction=instruction,
-        target_for_fewshot_sorting=LETTER_INDICES_AR[answer_index],
     )
 
 
@@ -413,7 +410,8 @@ def arabic_exams_pfn(line, task_name: str = None):
 def alghafa_pfn(line, task_name: str = None):
     question = line["query"]
     answer_index = int(line["label"])
-    choices = [line[key] for key in ["sol1", "sol2", "sol3", "sol4"]]
+    allowed_keys = [f"sol{i}" for i in range(1, 6)]
+    choices = [line[key] for key in allowed_keys if key in line]
 
     instruction = "الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح\n\n"
     query = f"{instruction}السؤال: {question}\n"
@@ -802,7 +800,6 @@ def madinah_qa_pfn(line, task_name: str = None):
         choices=choices,
         gold_index=answer_index,  # Correct index in the valid keys
         instruction=instruction,
-        target_for_fewshot_sorting=valid_keys_latin[answer_index],  # Correct answer in Latin form
     )
 
 
diff --git a/docs/source/adding-a-new-metric.mdx b/docs/source/adding-a-new-metric.mdx
index 35fc975f..6433d588 100644
--- a/docs/source/adding-a-new-metric.mdx
+++ b/docs/source/adding-a-new-metric.mdx
@@ -92,4 +92,3 @@ if __name__ == "__main__":
 
 You can then give your custom metric to lighteval by using `--custom-tasks
 path_to_your_file` when launching it.
-
diff --git a/docs/source/contributing-to-multilingual-evaluations.mdx b/docs/source/contributing-to-multilingual-evaluations.mdx
index 0d0855d7..4db1c935 100644
--- a/docs/source/contributing-to-multilingual-evaluations.mdx
+++ b/docs/source/contributing-to-multilingual-evaluations.mdx
@@ -8,7 +8,7 @@ We welcome translations in your language!
 
 To contribute, you'll need to
 1. Open the [translation_literals](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/templates/utils/translation_literals.py) file
-2. Edit the file to add or expand the literal for your language of interest. 
+2. Edit the file to add or expand the literal for your language of interest.
 
 ```python
     Language.ENGLISH: TranslationLiterals(
@@ -42,7 +42,7 @@ To contribute, you'll need to
 
 ## Contributing a new multilingual task
 
-You should first read our guide on [adding a custom task](adding-a-custom-task), to better understand the different parameters we use. 
+You should first read our guide on [adding a custom task](adding-a-custom-task), to better understand the different parameters we use.
 
 Then, you should take a look at the current [multilingual tasks](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/multilingual/tasks.py) file, to understand how they are defined. For multilingual evaluations the `prompt_function` should be implemented by language-adapted template. The template will take care of correct formatting, correct and consistent usage of language adjusted prompt anchors (e.g Question/Answer) and punctuation.
 
@@ -58,7 +58,7 @@ your_tasks = [
     LightevalTaskConfig(
         # Name of your evaluation
         name=f"evalname_{language.value}_{formulation.name.lower()}",
-        # The evaluation is community contributed 
+        # The evaluation is community contributed
         suite=["community"],
         # This will automatically get the correct metrics for your chosen formulation
         metric=get_metrics_for_formulation(
@@ -72,7 +72,7 @@ your_tasks = [
         # In this function, you choose which template to follow and for which language and formulation
         prompt_function=get_template_prompt_function(
             language=language,
-            # then use the adapter to define the mapping between the 
+            # then use the adapter to define the mapping between the
             # keys of the template (left), and the keys of your dataset
             # (right)
             # To know which template keys are required and available,
@@ -83,9 +83,9 @@ your_tasks = [
             },
             formulation=formulation,
         ),
-        # You can also add specific filters to remove irrelevant samples 
+        # You can also add specific filters to remove irrelevant samples
         hf_filter=lambda line: line["label"] in <condition>,
-        # You then select your huggingface dataset as well as 
+        # You then select your huggingface dataset as well as
         # the splits available for evaluation
         hf_repo=<dataset>,
         hf_subset=<subset>,
diff --git a/docs/source/using-the-python-api.mdx b/docs/source/using-the-python-api.mdx
index 8c44050f..583da5f5 100644
--- a/docs/source/using-the-python-api.mdx
+++ b/docs/source/using-the-python-api.mdx
@@ -35,7 +35,7 @@ def main():
         env_config=EnvConfig(cache_dir="tmp/"),
         # Remove the 2 parameters below once your configuration is tested
         override_batch_size=1,
-        max_samples=10 
+        max_samples=10
     )
 
     model_config = VLLMModelConfig(

From 264f3f89e3847e9b61dc24242d3dc4ae476ecc5b Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Fri, 20 Dec 2024 19:14:26 +0100
Subject: [PATCH 6/7] Fix warning about precedence of custom tasks over default
 ones in registry (#466)

* Fix precedence of default tasks over custom ones in registry

* Revert "Fix precedence of default tasks over custom ones in registry"

This reverts commit 8125ea230156f4c36cb21e15cfe8c74fec47a7a4.

* Fix comment/warning about precedence of custom over default tasks
---
 src/lighteval/tasks/registry.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py
index 69532c09..834e8170 100644
--- a/src/lighteval/tasks/registry.py
+++ b/src/lighteval/tasks/registry.py
@@ -148,10 +148,10 @@ def task_registry(self):
         intersection = set(default_tasks_registry.keys()).intersection(set(custom_tasks_registry.keys()))
         if len(intersection) > 0:
             logger.warning(
-                f"Following tasks ({intersection}) exists both in the default and custom tasks. Will use the default ones on conflict."
+                f"Following tasks ({intersection}) exists both in the default and custom tasks. Will use the custom ones on conflict."
             )
 
-        # Defaults tasks should overwrite custom tasks
+        # Custom tasks overwrite default tasks
         return {**default_tasks_registry, **custom_tasks_registry}
 
     @property

From 8568e72566305a9fdceef52b72b5e208f1c9401e Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Sat, 21 Dec 2024 06:47:54 +0100
Subject: [PATCH 7/7] Checkout PR merge commit for CI tests (#468)

---
 .github/workflows/tests.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index c0b06d36..950a7597 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -18,7 +18,6 @@ jobs:
        uses: actions/checkout@v3
        with:
         lfs: 'true'
-        ref: ${{ github.event.pull_request.head.sha }} # we want to test against our branch not against a merge commit
      - name: Setup Python environment
        uses: actions/setup-python@v4
        with: