
Commit

Merge branch 'main' into add-custom-model
JoelNiklaus authored Dec 23, 2024
2 parents cfd7254 + 8568e72 commit 3ddc104
Showing 10 changed files with 46 additions and 48 deletions.
5 changes: 2 additions & 3 deletions .github/workflows/tests.yaml
@@ -18,7 +18,6 @@ jobs:
uses: actions/checkout@v3
with:
lfs: 'true'
-ref: ${{ github.event.pull_request.head.sha }} # we want to test against our branch not against a merge commit
- name: Setup Python environment
uses: actions/setup-python@v4
with:
@@ -28,7 +27,7 @@ jobs:
run: |
pip install -e .[dev,extended_tasks,multilingual]
- name: Get cached files
-uses: actions/cache@v2
+uses: actions/cache@v4
id: get-cache
with:
path: "cache"
@@ -41,7 +40,7 @@
run: | # PYTHONPATH="${PYTHONPATH}:src" HF_DATASETS_CACHE="cache/datasets" HF_HOME="cache/models"
python -m pytest --disable-pytest-warnings
- name: Write cache
-uses: actions/cache@v2
+uses: actions/cache@v4
with:
path: "cache"
key: test-cache-HF
1 change: 0 additions & 1 deletion .github/workflows/trufflehog.yml
@@ -16,4 +16,3 @@ jobs:
fetch-depth: 0
- name: Secret Scanning
uses: trufflesecurity/trufflehog@main

7 changes: 2 additions & 5 deletions community_tasks/arabic_evals.py
@@ -86,7 +86,6 @@ def arabic_mmlu_pfn(line, task_name: str = None):
choices=valid_keys_arabic, # Return only valid choices (Arabic keys)
gold_index=answer_index, # Correct index in the valid Arabic keys
instruction=instruction,
-target_for_fewshot_sorting=valid_keys_arabic[answer_index], # Correct answer in Arabic form
)


@@ -149,7 +148,6 @@ def arabic_mmlu_ht_pfn(line, task_name: str = None):
choices=[str(i) for i in range(1, len(choices) + 1)], # List of strings instead of ints
gold_index=answer_index,
instruction=instruction,
-target_for_fewshot_sorting=str(answer_index), # Assuming it's sorted based on the number
)


@@ -328,7 +326,6 @@ def aratrust_pfn(line, task_name: str = None):
choices=LETTER_INDICES_AR[:3],
gold_index=answer_index,
instruction=instruction,
-target_for_fewshot_sorting=LETTER_INDICES_AR[answer_index],
)


@@ -413,7 +410,8 @@ def arabic_exams_pfn(line, task_name: str = None):
def alghafa_pfn(line, task_name: str = None):
question = line["query"]
answer_index = int(line["label"])
-choices = [line[key] for key in ["sol1", "sol2", "sol3", "sol4"]]
+allowed_keys = [f"sol{i}" for i in range(1, 6)]
+choices = [line[key] for key in allowed_keys if key in line]

instruction = "الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح\n\n"
query = f"{instruction}السؤال: {question}\n"
@@ -802,7 +800,6 @@ def madinah_qa_pfn(line, task_name: str = None):
choices=choices,
gold_index=answer_index, # Correct index in the valid keys
instruction=instruction,
-target_for_fewshot_sorting=valid_keys_latin[answer_index], # Correct answer in Latin form
)


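The `alghafa_pfn` change above replaces a hard-coded four-key lookup with one that tolerates four or five answer columns. A quick sketch of the new behaviour on a made-up row (the values are placeholders, not benchmark data):

```python
# Hypothetical row: this sample only provides four "sol*" columns.
line = {"query": "2 + 2 = ?", "label": "1", "sol1": "3", "sol2": "4", "sol3": "5", "sol4": "6"}

allowed_keys = [f"sol{i}" for i in range(1, 6)]  # sol1 ... sol5
choices = [line[key] for key in allowed_keys if key in line]

print(choices)  # ['3', '4', '5', '6'] -- a missing sol5 is skipped instead of raising KeyError
```

With the old `["sol1", "sol2", "sol3", "sol4"]` list, a row carrying a fifth option would silently lose it, while a row without `sol4` would raise a `KeyError`; the new lookup handles both cases.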
1 change: 0 additions & 1 deletion docs/source/adding-a-new-metric.mdx
@@ -92,4 +92,3 @@ if __name__ == "__main__":

You can then give your custom metric to lighteval by using `--custom-tasks
path_to_your_file` when launching it.

12 changes: 6 additions & 6 deletions docs/source/contributing-to-multilingual-evaluations.mdx
@@ -8,7 +8,7 @@ We welcome translations in your language!

To contribute, you'll need to
1. Open the [translation_literals](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/templates/utils/translation_literals.py) file
2. Edit the file to add or expand the literal for your language of interest.

```python
Language.ENGLISH: TranslationLiterals(
@@ -42,7 +42,7 @@ To contribute, you'll need to

## Contributing a new multilingual task

You should first read our guide on [adding a custom task](adding-a-custom-task), to better understand the different parameters we use.

Then, you should take a look at the current [multilingual tasks](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/multilingual/tasks.py) file, to understand how they are defined. For multilingual evaluations the `prompt_function` should be implemented by language-adapted template. The template will take care of correct formatting, correct and consistent usage of language adjusted prompt anchors (e.g Question/Answer) and punctuation.

@@ -58,7 +58,7 @@ your_tasks = [
LightevalTaskConfig(
# Name of your evaluation
name=f"evalname_{language.value}_{formulation.name.lower()}",
# The evaluation is community contributed
suite=["community"],
# This will automatically get the correct metrics for your chosen formulation
metric=get_metrics_for_formulation(
@@ -72,7 +72,7 @@ your_tasks = [
# In this function, you choose which template to follow and for which language and formulation
prompt_function=get_template_prompt_function(
language=language,
# then use the adapter to define the mapping between the
# keys of the template (left), and the keys of your dataset
# (right)
# To know which template keys are required and available,
@@ -83,9 +83,9 @@ your_tasks = [
},
formulation=formulation,
),
# You can also add specific filters to remove irrelevant samples
hf_filter=lambda line: line["label"] in <condition>,
# You then select your huggingface dataset as well as
# the splits available for evaluation
hf_repo=<dataset>,
hf_subset=<subset>,
5 changes: 4 additions & 1 deletion docs/source/package_reference/logging.mdx
@@ -1,4 +1,7 @@
-# Loggers
+# Logging

+## EvaluationTracker
+[[autodoc]] logging.evaluation_tracker.EvaluationTracker

## GeneralConfigLogger
[[autodoc]] logging.info_loggers.GeneralConfigLogger
2 changes: 1 addition & 1 deletion docs/source/using-the-python-api.mdx
@@ -35,7 +35,7 @@ def main():
env_config=EnvConfig(cache_dir="tmp/"),
# Remove the 2 parameters below once your configuration is tested
override_batch_size=1,
max_samples=10
)

model_config = VLLMModelConfig(
55 changes: 28 additions & 27 deletions src/lighteval/logging/evaluation_tracker.py
@@ -20,7 +20,6 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

-import copy
import json
import logging
import os
@@ -82,16 +81,35 @@ def default(self, o):


class EvaluationTracker:
"""
Keeps track of the overall evaluation process and relevant informations.
"""Keeps track of the overall evaluation process and relevant information.
The [`EvaluationTracker`] contains specific loggers for experiments details
([`DetailsLogger`]), metrics ([`MetricsLogger`]), task versions
([`VersionsLogger`]) as well as for the general configurations of both the
specific task ([`TaskConfigLogger`]) and overall evaluation run
([`GeneralConfigLogger`]). It compiles the data from these loggers and
The [`~logging.evaluation_tracker.EvaluationTracker`] contains specific loggers for experiments details
([`~logging.evaluation_tracker.DetailsLogger`]), metrics ([`~logging.evaluation_tracker.MetricsLogger`]), task versions
([`~logging.evaluation_tracker.VersionsLogger`]) as well as for the general configurations of both the
specific task ([`~logging.evaluation_tracker.TaskConfigLogger`]) and overall evaluation run
([`~logging.evaluation_tracker.GeneralConfigLogger`]). It compiles the data from these loggers and
writes it to files, which can be published to the Hugging Face hub if
requested.
Args:
output_dir (`str`): Local folder path where you want results to be saved.
save_details (`bool`, defaults to True): If True, details are saved to the `output_dir`.
push_to_hub (`bool`, defaults to False): If True, details are pushed to the hub.
Results are pushed to `{hub_results_org}/details__{sanitized model_name}` for the model `model_name`, a public dataset,
if `public` is True else `{hub_results_org}/details__{sanitized model_name}_private`, a private dataset.
push_to_tensorboard (`bool`, defaults to False): If True, will create and push the results for a tensorboard folder on the hub.
hub_results_org (`str`, *optional*): The organisation to push the results to.
See more details about the datasets organisation in [`EvaluationTracker.save`].
tensorboard_metric_prefix (`str`, defaults to "eval"): Prefix for the metrics in the tensorboard logs.
public (`bool`, defaults to False): If True, results and details are pushed to public orgs.
nanotron_run_info ([`~nanotron.config.GeneralArgs`], *optional*): Reference to information about Nanotron models runs.
**Attributes**:
- **details_logger** ([`~logging.info_loggers.DetailsLogger`]) -- Logger for experiment details.
- **metrics_logger** ([`~logging.info_loggers.MetricsLogger`]) -- Logger for experiment metrics.
- **versions_logger** ([`~logging.info_loggers.VersionsLogger`]) -- Logger for task versions.
- **general_config_logger** ([`~logging.info_loggers.GeneralConfigLogger`]) -- Logger for general configuration.
- **task_config_logger** ([`~logging.info_loggers.TaskConfigLogger`]) -- Logger for task configuration.
"""

def __init__(
@@ -105,23 +123,7 @@ def __init__(
public: bool = False,
nanotron_run_info: "GeneralArgs" = None,
) -> None:
"""
Creates all the necessary loggers for evaluation tracking.
Args:
output_dir (str): Local folder path where you want results to be saved
save_details (bool): If True, details are saved to the output_dir
push_to_hub (bool): If True, details are pushed to the hub.
Results are pushed to `{hub_results_org}/details__{sanitized model_name}` for the model `model_name`, a public dataset,
if `public` is True else `{hub_results_org}/details__{sanitized model_name}_private`, a private dataset.
push_results_to_tensorboard (bool): If True, will create and push the results for a tensorboard folder on the hub
hub_results_org (str): The organisation to push the results to. See
more details about the datasets organisation in
[`EvaluationTracker.save`]
tensorboard_metric_prefix (str): Prefix for the metrics in the tensorboard logs
public (bool): If True, results and details are pushed in private orgs
nanotron_run_info (GeneralArgs): Reference to informations about Nanotron models runs
"""
"""Creates all the necessary loggers for evaluation tracking."""
self.details_logger = DetailsLogger()
self.metrics_logger = MetricsLogger()
self.versions_logger = VersionsLogger()
@@ -153,8 +155,7 @@ def save(self) -> None:
date_id = datetime.now().isoformat().replace(":", "-")

# We first prepare data to save
-config_general = copy.deepcopy(self.general_config_logger)
-config_general = asdict(config_general)
+config_general = asdict(self.general_config_logger)
# We remove the config from logging, which contains context/accelerator objects
config_general.pop("config")

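As a minimal sketch of how the arguments documented in the new docstring fit together (the output path and organisation name are placeholders, not values from this commit):

```python
from lighteval.logging.evaluation_tracker import EvaluationTracker

# Sketch only: "results" and "my-org" are placeholder values.
evaluation_tracker = EvaluationTracker(
    output_dir="results",      # local folder where results are saved
    save_details=True,         # also write per-sample details to output_dir
    push_to_hub=False,         # set to True to push details datasets to the Hub
    hub_results_org="my-org",  # organisation that would receive the details dataset
    public=False,              # keep any pushed datasets private
)
```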
2 changes: 1 addition & 1 deletion src/lighteval/main_accelerate.py
@@ -31,7 +31,7 @@
logger = logging.getLogger(__name__)

TOKEN = os.getenv("HF_TOKEN")
-CACHE_DIR: str = os.getenv("HF_HOME", "/scratch")
+CACHE_DIR: str = os.getenv("HF_HOME")

HELP_PANEL_NAME_1 = "Common Parameters"
HELP_PANEL_NAME_2 = "Logging Parameters"
Expand Down
4 changes: 2 additions & 2 deletions src/lighteval/tasks/registry.py
@@ -148,10 +148,10 @@ def task_registry(self):
intersection = set(default_tasks_registry.keys()).intersection(set(custom_tasks_registry.keys()))
if len(intersection) > 0:
logger.warning(
f"Following tasks ({intersection}) exists both in the default and custom tasks. Will use the default ones on conflict."
f"Following tasks ({intersection}) exists both in the default and custom tasks. Will use the custom ones on conflict."
)

-# Defaults tasks should overwrite custom tasks
+# Custom tasks overwrite defaults tasks
return {**default_tasks_registry, **custom_tasks_registry}

@property
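The `registry.py` change only rewords the warning and the comment; the precedence itself comes from Python's dict-merge semantics, where the right-hand dict wins on duplicate keys. A small illustration with made-up task names:

```python
# The later dict wins on duplicate keys, so a custom task shadows a default task of the same name.
default_tasks_registry = {"leaderboard|mmlu": "default impl", "helm|boolq": "default impl"}
custom_tasks_registry = {"leaderboard|mmlu": "custom impl"}

merged = {**default_tasks_registry, **custom_tasks_registry}
print(merged["leaderboard|mmlu"])  # custom impl -- the custom task overrides the default
```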
