From 0977fde60ba3564cf34c53606a53962d81eef7b0 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 18 Dec 2024 13:47:14 +0100 Subject: [PATCH 1/7] Upgrade deprecated GH Action cache@v2 (#456) --- .github/workflows/tests.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 7b980508..c0b06d36 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -28,7 +28,7 @@ jobs: run: | pip install -e .[dev,extended_tasks,multilingual] - name: Get cached files - uses: actions/cache@v2 + uses: actions/cache@v4 id: get-cache with: path: "cache" @@ -41,7 +41,7 @@ jobs: run: | # PYTHONPATH="${PYTHONPATH}:src" HF_DATASETS_CACHE="cache/datasets" HF_HOME="cache/models" python -m pytest --disable-pytest-warnings - name: Write cache - uses: actions/cache@v2 + uses: actions/cache@v4 with: path: "cache" key: test-cache-HF From c45e3c8b96b0f5e99eab4d1c152198792be24251 Mon Sep 17 00:00:00 2001 From: Aoi <82735346+ryan-minato@users.noreply.github.com> Date: Thu, 19 Dec 2024 21:31:26 +0900 Subject: [PATCH 2/7] fix: CACHE_DIR Default Value in Accelerate Pipeline (#461) --- src/lighteval/main_accelerate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/main_accelerate.py b/src/lighteval/main_accelerate.py index a0a01abe..2dd78f44 100644 --- a/src/lighteval/main_accelerate.py +++ b/src/lighteval/main_accelerate.py @@ -31,7 +31,7 @@ logger = logging.getLogger(__name__) TOKEN = os.getenv("HF_TOKEN") -CACHE_DIR: str = os.getenv("HF_HOME", "/scratch") +CACHE_DIR: str = os.getenv("HF_HOME") HELP_PANEL_NAME_1 = "Common Parameters" HELP_PANEL_NAME_2 = "Logging Parameters" From 988fa94db7cdd539f2a7a4971ed67890b0da4184 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 19 Dec 2024 14:55:05 +0100 Subject: [PATCH 3/7] Add EvaluationTracker to docs and fix its docstring (#464) * Fix definition of public in docstring * Fix push_to_tensorboard param name in docstring * Fix docstring style * Add EvaluationTracker to docs * Fix docstring style * Move docstring to class header * Add attributes to docstring * Fix style * Fix style * Fix style * Fix style * Fix style * Fix style * Fix internal links in docstring --- docs/source/package_reference/logging.mdx | 5 +- src/lighteval/logging/evaluation_tracker.py | 51 +++++++++++---------- 2 files changed, 31 insertions(+), 25 deletions(-) diff --git a/docs/source/package_reference/logging.mdx b/docs/source/package_reference/logging.mdx index 9fd01154..9102755c 100644 --- a/docs/source/package_reference/logging.mdx +++ b/docs/source/package_reference/logging.mdx @@ -1,4 +1,7 @@ -# Loggers +# Logging + +## EvaluationTracker +[[autodoc]] logging.evaluation_tracker.EvaluationTracker ## GeneralConfigLogger [[autodoc]] logging.info_loggers.GeneralConfigLogger diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index 01705534..8cc8c09e 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -82,16 +82,35 @@ def default(self, o): class EvaluationTracker: - """ - Keeps track of the overall evaluation process and relevant informations. + """Keeps track of the overall evaluation process and relevant information. - The [`EvaluationTracker`] contains specific loggers for experiments details - ([`DetailsLogger`]), metrics ([`MetricsLogger`]), task versions - ([`VersionsLogger`]) as well as for the general configurations of both the - specific task ([`TaskConfigLogger`]) and overall evaluation run - ([`GeneralConfigLogger`]). It compiles the data from these loggers and + The [`~logging.evaluation_tracker.EvaluationTracker`] contains specific loggers for experiments details + ([`~logging.evaluation_tracker.DetailsLogger`]), metrics ([`~logging.evaluation_tracker.MetricsLogger`]), task versions + ([`~logging.evaluation_tracker.VersionsLogger`]) as well as for the general configurations of both the + specific task ([`~logging.evaluation_tracker.TaskConfigLogger`]) and overall evaluation run + ([`~logging.evaluation_tracker.GeneralConfigLogger`]). It compiles the data from these loggers and writes it to files, which can be published to the Hugging Face hub if requested. + + Args: + output_dir (`str`): Local folder path where you want results to be saved. + save_details (`bool`, defaults to True): If True, details are saved to the `output_dir`. + push_to_hub (`bool`, defaults to False): If True, details are pushed to the hub. + Results are pushed to `{hub_results_org}/details__{sanitized model_name}` for the model `model_name`, a public dataset, + if `public` is True else `{hub_results_org}/details__{sanitized model_name}_private`, a private dataset. + push_to_tensorboard (`bool`, defaults to False): If True, will create and push the results for a tensorboard folder on the hub. + hub_results_org (`str`, *optional*): The organisation to push the results to. + See more details about the datasets organisation in [`EvaluationTracker.save`]. + tensorboard_metric_prefix (`str`, defaults to "eval"): Prefix for the metrics in the tensorboard logs. + public (`bool`, defaults to False): If True, results and details are pushed to public orgs. + nanotron_run_info ([`~nanotron.config.GeneralArgs`], *optional*): Reference to information about Nanotron models runs. + + **Attributes**: + - **details_logger** ([`~logging.info_loggers.DetailsLogger`]) -- Logger for experiment details. + - **metrics_logger** ([`~logging.info_loggers.MetricsLogger`]) -- Logger for experiment metrics. + - **versions_logger** ([`~logging.info_loggers.VersionsLogger`]) -- Logger for task versions. + - **general_config_logger** ([`~logging.info_loggers.GeneralConfigLogger`]) -- Logger for general configuration. + - **task_config_logger** ([`~logging.info_loggers.TaskConfigLogger`]) -- Logger for task configuration. """ def __init__( @@ -105,23 +124,7 @@ def __init__( public: bool = False, nanotron_run_info: "GeneralArgs" = None, ) -> None: - """ - Creates all the necessary loggers for evaluation tracking. - - Args: - output_dir (str): Local folder path where you want results to be saved - save_details (bool): If True, details are saved to the output_dir - push_to_hub (bool): If True, details are pushed to the hub. - Results are pushed to `{hub_results_org}/details__{sanitized model_name}` for the model `model_name`, a public dataset, - if `public` is True else `{hub_results_org}/details__{sanitized model_name}_private`, a private dataset. - push_results_to_tensorboard (bool): If True, will create and push the results for a tensorboard folder on the hub - hub_results_org (str): The organisation to push the results to. See - more details about the datasets organisation in - [`EvaluationTracker.save`] - tensorboard_metric_prefix (str): Prefix for the metrics in the tensorboard logs - public (bool): If True, results and details are pushed in private orgs - nanotron_run_info (GeneralArgs): Reference to informations about Nanotron models runs - """ + """Creates all the necessary loggers for evaluation tracking.""" self.details_logger = DetailsLogger() self.metrics_logger = MetricsLogger() self.versions_logger = VersionsLogger() From a1c610daab3f796a273ab56f78b5f5fe9614b8aa Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 19 Dec 2024 16:03:44 +0100 Subject: [PATCH 4/7] Remove unnecessary deepcopy in evaluation_tracker (#459) * Remove unnecessary deepcopy in evaluation_tracker * Fix style --- src/lighteval/logging/evaluation_tracker.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index 8cc8c09e..6cad9189 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -20,7 +20,6 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -import copy import json import logging import os @@ -156,8 +155,7 @@ def save(self) -> None: date_id = datetime.now().isoformat().replace(":", "-") # We first prepare data to save - config_general = copy.deepcopy(self.general_config_logger) - config_general = asdict(config_general) + config_general = asdict(self.general_config_logger) # We remove the config from logging, which contains context/accelerator objects config_general.pop("config") From fbca143616c37f4336f80768cc4bdddb97bf3b06 Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Fri, 20 Dec 2024 22:12:34 +0400 Subject: [PATCH 5/7] Update arabic_evals.py: Fix custom arabic tasks [2nd attempt] (#444) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix alghafa prompt function by explicitly determining the list of choices based on task_name. (Not all subsets of AlGhafa Native share same columns) --------- Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> --- .github/workflows/trufflehog.yml | 1 - community_tasks/arabic_evals.py | 7 ++----- docs/source/adding-a-new-metric.mdx | 1 - .../contributing-to-multilingual-evaluations.mdx | 12 ++++++------ docs/source/using-the-python-api.mdx | 2 +- 5 files changed, 9 insertions(+), 14 deletions(-) diff --git a/.github/workflows/trufflehog.yml b/.github/workflows/trufflehog.yml index 8ac08ad6..ecdca01d 100644 --- a/.github/workflows/trufflehog.yml +++ b/.github/workflows/trufflehog.yml @@ -16,4 +16,3 @@ jobs: fetch-depth: 0 - name: Secret Scanning uses: trufflesecurity/trufflehog@main - diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index 4408f22f..86ab69e2 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -86,7 +86,6 @@ def arabic_mmlu_pfn(line, task_name: str = None): choices=valid_keys_arabic, # Return only valid choices (Arabic keys) gold_index=answer_index, # Correct index in the valid Arabic keys instruction=instruction, - target_for_fewshot_sorting=valid_keys_arabic[answer_index], # Correct answer in Arabic form ) @@ -149,7 +148,6 @@ def arabic_mmlu_ht_pfn(line, task_name: str = None): choices=[str(i) for i in range(1, len(choices) + 1)], # List of strings instead of ints gold_index=answer_index, instruction=instruction, - target_for_fewshot_sorting=str(answer_index), # Assuming it's sorted based on the number ) @@ -328,7 +326,6 @@ def aratrust_pfn(line, task_name: str = None): choices=LETTER_INDICES_AR[:3], gold_index=answer_index, instruction=instruction, - target_for_fewshot_sorting=LETTER_INDICES_AR[answer_index], ) @@ -413,7 +410,8 @@ def arabic_exams_pfn(line, task_name: str = None): def alghafa_pfn(line, task_name: str = None): question = line["query"] answer_index = int(line["label"]) - choices = [line[key] for key in ["sol1", "sol2", "sol3", "sol4"]] + allowed_keys = [f"sol{i}" for i in range(1, 6)] + choices = [line[key] for key in allowed_keys if key in line] instruction = "الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح\n\n" query = f"{instruction}السؤال: {question}\n" @@ -802,7 +800,6 @@ def madinah_qa_pfn(line, task_name: str = None): choices=choices, gold_index=answer_index, # Correct index in the valid keys instruction=instruction, - target_for_fewshot_sorting=valid_keys_latin[answer_index], # Correct answer in Latin form ) diff --git a/docs/source/adding-a-new-metric.mdx b/docs/source/adding-a-new-metric.mdx index 35fc975f..6433d588 100644 --- a/docs/source/adding-a-new-metric.mdx +++ b/docs/source/adding-a-new-metric.mdx @@ -92,4 +92,3 @@ if __name__ == "__main__": You can then give your custom metric to lighteval by using `--custom-tasks path_to_your_file` when launching it. - diff --git a/docs/source/contributing-to-multilingual-evaluations.mdx b/docs/source/contributing-to-multilingual-evaluations.mdx index 0d0855d7..4db1c935 100644 --- a/docs/source/contributing-to-multilingual-evaluations.mdx +++ b/docs/source/contributing-to-multilingual-evaluations.mdx @@ -8,7 +8,7 @@ We welcome translations in your language! To contribute, you'll need to 1. Open the [translation_literals](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/templates/utils/translation_literals.py) file -2. Edit the file to add or expand the literal for your language of interest. +2. Edit the file to add or expand the literal for your language of interest. ```python Language.ENGLISH: TranslationLiterals( @@ -42,7 +42,7 @@ To contribute, you'll need to ## Contributing a new multilingual task -You should first read our guide on [adding a custom task](adding-a-custom-task), to better understand the different parameters we use. +You should first read our guide on [adding a custom task](adding-a-custom-task), to better understand the different parameters we use. Then, you should take a look at the current [multilingual tasks](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/multilingual/tasks.py) file, to understand how they are defined. For multilingual evaluations the `prompt_function` should be implemented by language-adapted template. The template will take care of correct formatting, correct and consistent usage of language adjusted prompt anchors (e.g Question/Answer) and punctuation. @@ -58,7 +58,7 @@ your_tasks = [ LightevalTaskConfig( # Name of your evaluation name=f"evalname_{language.value}_{formulation.name.lower()}", - # The evaluation is community contributed + # The evaluation is community contributed suite=["community"], # This will automatically get the correct metrics for your chosen formulation metric=get_metrics_for_formulation( @@ -72,7 +72,7 @@ your_tasks = [ # In this function, you choose which template to follow and for which language and formulation prompt_function=get_template_prompt_function( language=language, - # then use the adapter to define the mapping between the + # then use the adapter to define the mapping between the # keys of the template (left), and the keys of your dataset # (right) # To know which template keys are required and available, @@ -83,9 +83,9 @@ your_tasks = [ }, formulation=formulation, ), - # You can also add specific filters to remove irrelevant samples + # You can also add specific filters to remove irrelevant samples hf_filter=lambda line: line["label"] in , - # You then select your huggingface dataset as well as + # You then select your huggingface dataset as well as # the splits available for evaluation hf_repo=, hf_subset=, diff --git a/docs/source/using-the-python-api.mdx b/docs/source/using-the-python-api.mdx index 8c44050f..583da5f5 100644 --- a/docs/source/using-the-python-api.mdx +++ b/docs/source/using-the-python-api.mdx @@ -35,7 +35,7 @@ def main(): env_config=EnvConfig(cache_dir="tmp/"), # Remove the 2 parameters below once your configuration is tested override_batch_size=1, - max_samples=10 + max_samples=10 ) model_config = VLLMModelConfig( From 264f3f89e3847e9b61dc24242d3dc4ae476ecc5b Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Fri, 20 Dec 2024 19:14:26 +0100 Subject: [PATCH 6/7] Fix warning about precedence of custom tasks over default ones in registry (#466) * Fix precedence of default tasks over custom ones in registry * Revert "Fix precedence of default tasks over custom ones in registry" This reverts commit 8125ea230156f4c36cb21e15cfe8c74fec47a7a4. * Fix comment/warning about precedence of custom over default tasks --- src/lighteval/tasks/registry.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index 69532c09..834e8170 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -148,10 +148,10 @@ def task_registry(self): intersection = set(default_tasks_registry.keys()).intersection(set(custom_tasks_registry.keys())) if len(intersection) > 0: logger.warning( - f"Following tasks ({intersection}) exists both in the default and custom tasks. Will use the default ones on conflict." + f"Following tasks ({intersection}) exists both in the default and custom tasks. Will use the custom ones on conflict." ) - # Defaults tasks should overwrite custom tasks + # Custom tasks overwrite defaults tasks return {**default_tasks_registry, **custom_tasks_registry} @property From 8568e72566305a9fdceef52b72b5e208f1c9401e Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Sat, 21 Dec 2024 06:47:54 +0100 Subject: [PATCH 7/7] Checkout PR merge commit for CI tests (#468) --- .github/workflows/tests.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index c0b06d36..950a7597 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -18,7 +18,6 @@ jobs: uses: actions/checkout@v3 with: lfs: 'true' - ref: ${{ github.event.pull_request.head.sha }} # we want to test against our branch not against a merge commit - name: Setup Python environment uses: actions/setup-python@v4 with: