diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 7b9805084..950a7597a 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -18,7 +18,6 @@ jobs:
         uses: actions/checkout@v3
         with:
           lfs: 'true'
-          ref: ${{ github.event.pull_request.head.sha }} # we want to test against our branch not against a merge commit
       - name: Setup Python environment
        uses: actions/setup-python@v4
        with:
@@ -28,7 +27,7 @@ jobs:
        run: |
          pip install -e .[dev,extended_tasks,multilingual]
      - name: Get cached files
-        uses: actions/cache@v2
+        uses: actions/cache@v4
        id: get-cache
        with:
          path: "cache"
@@ -41,7 +40,7 @@ jobs:
        run: |  # PYTHONPATH="${PYTHONPATH}:src"
          HF_DATASETS_CACHE="cache/datasets" HF_HOME="cache/models" python -m pytest --disable-pytest-warnings
      - name: Write cache
-        uses: actions/cache@v2
+        uses: actions/cache@v4
        with:
          path: "cache"
          key: test-cache-HF
diff --git a/.github/workflows/trufflehog.yml b/.github/workflows/trufflehog.yml
index 8ac08ad65..ecdca01de 100644
--- a/.github/workflows/trufflehog.yml
+++ b/.github/workflows/trufflehog.yml
@@ -16,4 +16,3 @@ jobs:
          fetch-depth: 0
      - name: Secret Scanning
        uses: trufflesecurity/trufflehog@main
-
diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py
index 4408f22fa..86ab69e28 100644
--- a/community_tasks/arabic_evals.py
+++ b/community_tasks/arabic_evals.py
@@ -86,7 +86,6 @@ def arabic_mmlu_pfn(line, task_name: str = None):
         choices=valid_keys_arabic,  # Return only valid choices (Arabic keys)
         gold_index=answer_index,  # Correct index in the valid Arabic keys
         instruction=instruction,
-        target_for_fewshot_sorting=valid_keys_arabic[answer_index],  # Correct answer in Arabic form
     )
 
 
@@ -149,7 +148,6 @@ def arabic_mmlu_ht_pfn(line, task_name: str = None):
         choices=[str(i) for i in range(1, len(choices) + 1)],  # List of strings instead of ints
         gold_index=answer_index,
         instruction=instruction,
-        target_for_fewshot_sorting=str(answer_index),  # Assuming it's sorted based on the number
     )
 
 
@@ -328,7 +326,6 @@ def aratrust_pfn(line, task_name: str = None):
         choices=LETTER_INDICES_AR[:3],
         gold_index=answer_index,
         instruction=instruction,
-        target_for_fewshot_sorting=LETTER_INDICES_AR[answer_index],
     )
 
 
@@ -413,7 +410,8 @@ def arabic_exams_pfn(line, task_name: str = None):
 
 def alghafa_pfn(line, task_name: str = None):
     question = line["query"]
     answer_index = int(line["label"])
-    choices = [line[key] for key in ["sol1", "sol2", "sol3", "sol4"]]
+    allowed_keys = [f"sol{i}" for i in range(1, 6)]
+    choices = [line[key] for key in allowed_keys if key in line]
     instruction = "الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح\n\n"
     query = f"{instruction}السؤال: {question}\n"
@@ -802,7 +800,6 @@ def madinah_qa_pfn(line, task_name: str = None):
         choices=choices,
         gold_index=answer_index,  # Correct index in the valid keys
         instruction=instruction,
-        target_for_fewshot_sorting=valid_keys_latin[answer_index],  # Correct answer in Latin form
     )
 
 
diff --git a/docs/source/adding-a-new-metric.mdx b/docs/source/adding-a-new-metric.mdx
index 35fc975f8..6433d5883 100644
--- a/docs/source/adding-a-new-metric.mdx
+++ b/docs/source/adding-a-new-metric.mdx
@@ -92,4 +92,3 @@ if __name__ == "__main__":
 ```
 
 You can then give your custom metric to lighteval by using `--custom-tasks path_to_your_file` when launching it.
-
diff --git a/docs/source/contributing-to-multilingual-evaluations.mdx b/docs/source/contributing-to-multilingual-evaluations.mdx
index 0d0855d75..4db1c935b 100644
--- a/docs/source/contributing-to-multilingual-evaluations.mdx
+++ b/docs/source/contributing-to-multilingual-evaluations.mdx
@@ -8,7 +8,7 @@ We welcome translations in your language! To contribute, you'll need to
 
 1. Open the [translation_literals](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/templates/utils/translation_literals.py) file
-2. Edit the file to add or expand the literal for your language of interest. 
+2. Edit the file to add or expand the literal for your language of interest.
 
 ```python
 Language.ENGLISH: TranslationLiterals(
@@ -42,7 +42,7 @@
 
 ## Contributing a new multilingual task
 
-You should first read our guide on [adding a custom task](adding-a-custom-task), to better understand the different parameters we use. 
+You should first read our guide on [adding a custom task](adding-a-custom-task), to better understand the different parameters we use.
 Then, you should take a look at the current [multilingual tasks](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/multilingual/tasks.py) file, to understand how they are defined.
 
 For multilingual evaluations the `prompt_function` should be implemented by language-adapted template. The template will take care of correct formatting, correct and consistent usage of language adjusted prompt anchors (e.g Question/Answer) and punctuation.
@@ -58,7 +58,7 @@ your_tasks = [
     LightevalTaskConfig(
         # Name of your evaluation
         name=f"evalname_{language.value}_{formulation.name.lower()}",
-        # The evaluation is community contributed 
+        # The evaluation is community contributed
         suite=["community"],
         # This will automatically get the correct metrics for your chosen formulation
         metric=get_metrics_for_formulation(
@@ -72,7 +72,7 @@ your_tasks = [
         # In this function, you choose which template to follow and for which language and formulation
         prompt_function=get_template_prompt_function(
             language=language,
-            # then use the adapter to define the mapping between the 
+            # then use the adapter to define the mapping between the
             # keys of the template (left), and the keys of your dataset
             # (right)
             # To know which template keys are required and available,
@@ -83,9 +83,9 @@ your_tasks = [
             },
             formulation=formulation,
         ),
-        # You can also add specific filters to remove irrelevant samples 
+        # You can also add specific filters to remove irrelevant samples
         hf_filter=lambda line: line["label"] in ,
-        # You then select your huggingface dataset as well as 
+        # You then select your huggingface dataset as well as
         # the splits available for evaluation
         hf_repo=,
         hf_subset=,
diff --git a/docs/source/package_reference/logging.mdx b/docs/source/package_reference/logging.mdx
index 9fd01154e..9102755c1 100644
--- a/docs/source/package_reference/logging.mdx
+++ b/docs/source/package_reference/logging.mdx
@@ -1,4 +1,7 @@
-# Loggers
+# Logging
+
+## EvaluationTracker
+[[autodoc]] logging.evaluation_tracker.EvaluationTracker
 
 ## GeneralConfigLogger
 [[autodoc]] logging.info_loggers.GeneralConfigLogger
diff --git a/docs/source/using-the-python-api.mdx b/docs/source/using-the-python-api.mdx
index 8c44050f4..583da5f54 100644
--- a/docs/source/using-the-python-api.mdx
+++ b/docs/source/using-the-python-api.mdx
@@ -35,7 +35,7 @@ def main():
         env_config=EnvConfig(cache_dir="tmp/"),
         # Remove the 2 parameters below once your configuration is tested
         override_batch_size=1,
-        max_samples=10 
+        max_samples=10
     )
 
     model_config = VLLMModelConfig(
diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py
index 017055348..6cad9189f 100644
--- a/src/lighteval/logging/evaluation_tracker.py
+++ b/src/lighteval/logging/evaluation_tracker.py
@@ -20,7 +20,6 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-import copy
 import json
 import logging
 import os
@@ -82,16 +81,35 @@ def default(self, o):
 
 
 class EvaluationTracker:
-    """
-    Keeps track of the overall evaluation process and relevant informations.
+    """Keeps track of the overall evaluation process and relevant information.
 
-    The [`EvaluationTracker`] contains specific loggers for experiments details
-    ([`DetailsLogger`]), metrics ([`MetricsLogger`]), task versions
-    ([`VersionsLogger`]) as well as for the general configurations of both the
-    specific task ([`TaskConfigLogger`]) and overall evaluation run
-    ([`GeneralConfigLogger`]). It compiles the data from these loggers and
+    The [`~logging.evaluation_tracker.EvaluationTracker`] contains specific loggers for experiment details
+    ([`~logging.info_loggers.DetailsLogger`]), metrics ([`~logging.info_loggers.MetricsLogger`]), task versions
+    ([`~logging.info_loggers.VersionsLogger`]) as well as for the general configurations of both the
+    specific task ([`~logging.info_loggers.TaskConfigLogger`]) and overall evaluation run
+    ([`~logging.info_loggers.GeneralConfigLogger`]). It compiles the data from these loggers and
     writes it to files, which can be published to the Hugging Face hub if requested.
+
+    Args:
+        output_dir (`str`): Local folder path where you want results to be saved.
+        save_details (`bool`, defaults to True): If True, details are saved to the `output_dir`.
+        push_to_hub (`bool`, defaults to False): If True, details are pushed to the hub.
+            Results are pushed to `{hub_results_org}/details__{sanitized model_name}` for the model `model_name`, a public dataset,
+            if `public` is True else `{hub_results_org}/details__{sanitized model_name}_private`, a private dataset.
+        push_to_tensorboard (`bool`, defaults to False): If True, will create and push the results for a tensorboard folder on the hub.
+        hub_results_org (`str`, *optional*): The organisation to push the results to.
+            See more details about the datasets organisation in [`EvaluationTracker.save`].
+        tensorboard_metric_prefix (`str`, defaults to "eval"): Prefix for the metrics in the tensorboard logs.
+        public (`bool`, defaults to False): If True, results and details are pushed to public orgs.
+        nanotron_run_info ([`~nanotron.config.GeneralArgs`], *optional*): Reference to information about Nanotron model runs.
+
+    **Attributes**:
+        - **details_logger** ([`~logging.info_loggers.DetailsLogger`]) -- Logger for experiment details.
+        - **metrics_logger** ([`~logging.info_loggers.MetricsLogger`]) -- Logger for experiment metrics.
+        - **versions_logger** ([`~logging.info_loggers.VersionsLogger`]) -- Logger for task versions.
+        - **general_config_logger** ([`~logging.info_loggers.GeneralConfigLogger`]) -- Logger for general configuration.
+        - **task_config_logger** ([`~logging.info_loggers.TaskConfigLogger`]) -- Logger for task configuration.
     """
 
     def __init__(
@@ -105,23 +123,7 @@ def __init__(
         self,
         output_dir: str,
         save_details: bool = True,
         push_to_hub: bool = False,
         push_to_tensorboard: bool = False,
         hub_results_org: str = "",
         tensorboard_metric_prefix: str = "eval",
         public: bool = False,
         nanotron_run_info: "GeneralArgs" = None,
     ) -> None:
-        """
-        Creates all the necessary loggers for evaluation tracking.
-
-        Args:
-            output_dir (str): Local folder path where you want results to be saved
-            save_details (bool): If True, details are saved to the output_dir
-            push_to_hub (bool): If True, details are pushed to the hub.
-                Results are pushed to `{hub_results_org}/details__{sanitized model_name}` for the model `model_name`, a public dataset,
-                if `public` is True else `{hub_results_org}/details__{sanitized model_name}_private`, a private dataset.
-            push_results_to_tensorboard (bool): If True, will create and push the results for a tensorboard folder on the hub
-            hub_results_org (str): The organisation to push the results to. See
-                more details about the datasets organisation in
-                [`EvaluationTracker.save`]
-            tensorboard_metric_prefix (str): Prefix for the metrics in the tensorboard logs
-            public (bool): If True, results and details are pushed in private orgs
-            nanotron_run_info (GeneralArgs): Reference to informations about Nanotron models runs
-        """
+        """Creates all the necessary loggers for evaluation tracking."""
         self.details_logger = DetailsLogger()
         self.metrics_logger = MetricsLogger()
         self.versions_logger = VersionsLogger()
@@ -153,8 +155,7 @@ def save(self) -> None:
         date_id = datetime.now().isoformat().replace(":", "-")
 
         # We first prepare data to save
-        config_general = copy.deepcopy(self.general_config_logger)
-        config_general = asdict(config_general)
+        config_general = asdict(self.general_config_logger)
 
         # We remove the config from logging, which contains context/accelerator objects
         config_general.pop("config")
diff --git a/src/lighteval/main_accelerate.py b/src/lighteval/main_accelerate.py
index a0a01abe8..2dd78f445 100644
--- a/src/lighteval/main_accelerate.py
+++ b/src/lighteval/main_accelerate.py
@@ -31,7 +31,7 @@
 logger = logging.getLogger(__name__)
 
 TOKEN = os.getenv("HF_TOKEN")
-CACHE_DIR: str = os.getenv("HF_HOME", "/scratch")
+CACHE_DIR: str = os.getenv("HF_HOME")
 
 HELP_PANEL_NAME_1 = "Common Parameters"
 HELP_PANEL_NAME_2 = "Logging Parameters"
diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py
index 69532c095..834e81706 100644
--- a/src/lighteval/tasks/registry.py
+++ b/src/lighteval/tasks/registry.py
@@ -148,10 +148,10 @@ def task_registry(self):
         intersection = set(default_tasks_registry.keys()).intersection(set(custom_tasks_registry.keys()))
         if len(intersection) > 0:
             logger.warning(
-                f"Following tasks ({intersection}) exists both in the default and custom tasks. Will use the default ones on conflict."
+                f"Following tasks ({intersection}) exist in both the default and custom tasks. Will use the custom ones on conflict."
             )
 
-        # Defaults tasks should overwrite custom tasks
+        # Custom tasks overwrite default tasks
         return {**default_tasks_registry, **custom_tasks_registry}
 
     @property