Skip to content

Commit

Permalink
Merge branch 'main' into adding_inference_endpoints
Browse files Browse the repository at this point in the history
  • Loading branch information
clefourrier committed Feb 6, 2024
2 parents e74ae0a + 059e100 commit 203692a
Show file tree
Hide file tree
Showing 8 changed files with 402 additions and 77 deletions.
32 changes: 23 additions & 9 deletions src/lighteval/logging/evaluation_tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@


class EnhancedJSONEncoder(json.JSONEncoder):
"""Provides a proper json encoding for the loggers and trackers json dumps.
"""
Provides a proper json encoding for the loggers and trackers json dumps.
Notably manages the json encoding of dataclasses.
"""

Expand All @@ -39,10 +40,16 @@ def default(self, o):


class EvaluationTracker:
"""Keeps track of the overall evaluation process and relevant informations.
The [`EvaluationTracker`] contains specific loggers for experiments details ([`DetailsLogger`]), metrics ([`MetricsLogger`]), task versions ([`VersionsLogger`]) as well as for the general configurations of both the specific task ([`TaskConfigLogger`]) and overall evaluation run ([`GeneralConfigLogger`]).
It compiles the data from these loggers and writes it to files, which can be published to the Hugging Face hub if requested.
"""
Keeps track of the overall evaluation process and relevant informations.
The [`EvaluationTracker`] contains specific loggers for experiments details
([`DetailsLogger`]), metrics ([`MetricsLogger`]), task versions
([`VersionsLogger`]) as well as for the general configurations of both the
specific task ([`TaskConfigLogger`]) and overall evaluation run
([`GeneralConfigLogger`]). It compiles the data from these loggers and
writes it to files, which can be published to the Hugging Face hub if
requested.
"""

details_logger: DetailsLogger
Expand All @@ -53,11 +60,15 @@ class EvaluationTracker:
hub_results_org: str

def __init__(self, hub_results_org: str = "", token: str = "") -> None:
"""Creates all the necessary loggers for evaluation tracking.
"""
Creates all the necessary loggers for evaluation tracking.
Args:
hub_results_org (str): The organisation to push the results to. See more details about the datasets organisation in [`EvaluationTracker.save`]
token (str): Token to use when pushing to the hub. This token should have write access to `hub_results_org`.
hub_results_org (str): The organisation to push the results to. See
more details about the datasets organisation in
[`EvaluationTracker.save`]
token (str): Token to use when pushing to the hub. This token should
have write access to `hub_results_org`.
"""
self.details_logger = DetailsLogger()
self.metrics_logger = MetricsLogger()
Expand All @@ -79,7 +90,8 @@ def save(
) -> None:
"""Saves the experiment information and results to files, and to the hub if requested.
Note: In case of save failure, this function will only print a warning, with the error message.
Note:
In case of save failure, this function will only print a warning, with the error message.
Args:
output_dir (str): Local folder path where you want results to be saved
Expand Down Expand Up @@ -204,6 +216,7 @@ def details_to_hub(
details_folder_path (str or Path): Local path of the current's experiment details folder.
The details folder (created by [`EvaluationTracker.save`]) should contain one parquet file per task used during the evaluation run of the current model.
push_as_public (bool, optional): If True, the results will be pushed publicly, else the datasets will be private.
"""
results_file_path = str(results_file_path)
details_folder_path = str(details_folder_path)
Expand Down Expand Up @@ -255,6 +268,7 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None:
Args:
repo_id (str): Details dataset repository path on the hub (`org/dataset`)
model_name (str): Name of the currently evaluated model.
"""
# Add a nice dataset card and the configuration YAML
files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset")
Expand Down
49 changes: 44 additions & 5 deletions src/lighteval/logging/info_loggers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import time
from dataclasses import asdict, dataclass, field
from typing import Union

import git
import numpy as np
Expand Down Expand Up @@ -38,7 +39,7 @@ class GeneralConfigLogger:
job_id (int): If the evaluation suite is launched as a slurm job, stores the current job id.
Purely informative parameter used to retrieve scheduler logs.
start_time (float): Start time of the experiment. Logged at class init.
end_time (float): Start time of the experiment. Logged when calling [`GeneralConfigLogger.log_end_time`]
end_time (float): End time of the experiment. Logged when calling [`GeneralConfigLogger.log_end_time`]
total_evaluation_time_secondes (str): Inferred total evaluation time in seconds (from the start and end times).
model_name (str): Name of the currently evaluated model.
model_sha (str): Commit hash of the currently evaluated model on the hub if available.
Expand Down Expand Up @@ -72,14 +73,44 @@ def __init__(self) -> None:
self.lighteval_sha = repo.git.rev_parse("HEAD")
self.start_time = time.perf_counter()

def log_args_info(self, num_fewshot_seeds, override_batch_size, max_samples, job_id, config=None) -> None:
def log_args_info(
self,
num_fewshot_seeds: int,
override_batch_size: Union[None, int],
max_samples: Union[None, int],
job_id: str,
config: "BrrrConfig" = None,
) -> None:
"""
Logs the information about the arguments passed to the method.
Args:
num_fewshot_seeds (int): number of few-shot seeds.
override_batch_size (Union[None, int]): overridden batch size.
If strictly positive, its value is used as the batch size for all experiments.
Else, the batch size is automatically inferred depending on what fits in memory.
max_samples (Union[None, int]): maximum number of samples, if None, use all the samples available.
job_id (str): job ID, used to retrieve logs.
config (optional): BrrrConfig
Returns:
None
"""
self.num_fewshot_seeds = num_fewshot_seeds
self.override_batch_size = override_batch_size
self.max_samples = max_samples
self.job_id = job_id
self.config = config

def log_model_info(self, model_info: ModelInfo) -> None:
"""
Logs the model information.
Args:
model_info (ModelInfo): Model information to be logged.
"""
self.model_name = model_info.model_name
self.model_sha = model_info.model_sha
self.model_dtype = model_info.model_dtype
Expand All @@ -102,6 +133,7 @@ class DetailsLogger:
Example: winogrande: [sample1_details, sample2_details, ...]
compiled_details (dict[str, `CompiledDetail`]): : Maps each task name to the list of its samples' compiled details.
compiled_details_over_all_tasks (CompiledDetailOverAllTasks): Aggregated details over all the tasks.
"""

@dataclass()
Expand Down Expand Up @@ -129,6 +161,7 @@ class Detail:
choices (list): List of the possible choices (for multichoice/loglikelihood evaluations)
gold_index (list): Indices of the gold targets among the [`choices`]
metrics (dict): Metric name to current example score
"""

example: str = ""
Expand Down Expand Up @@ -160,9 +193,10 @@ class CompiledDetail:
padded (int): Total umber of samples which needed padding during the batching step for the current task.
non_padded (int): Total number of samples which did not need padding during the batching step for the current task.
effective_few_shots (float): Average effective few shots across all samples for the current task.
The effective few shot is the number of few shots actually used to fit the prompt in the model context
effective few shot is the number of few shots actually used to fit the prompt in the model context
length while allowing model generation of the expected size.
num_truncated_few_shots (int): Total number of samples which required truncated prompts to fit the model size for the current task.
"""

hashes: dict = field(default_factory=dict)
Expand All @@ -186,9 +220,10 @@ class CompiledDetailOverAllTasks:
padded (int): Number of samples which needed padding during the batching step across all tasks.
non_padded (int): Number of samples which did not need padding during the batching step across all tasks.
effective_few_shots (float): Average effective few shots across all samples across all tasks.
The effective few shot is the number of few shots actually used to fit the prompt in the model context
effective few shot is the number of few shots actually used to fit the prompt in the model context
length while allowing model generation of the expected size.
num_truncated_few_shots (int): Number of samples which required truncated prompts to fit the model size across all tasks.
"""

hashes: dict = field(default_factory=dict)
Expand Down Expand Up @@ -388,7 +423,8 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int =
Args:
task_dict (dict[str, LightevalTask]): used to determine what aggregation function to use for each metric
bootstrap_iters (int, optional): _description_. Defaults to 1000.
bootstrap_iters (int, optional): Number of runs used to run the statistical bootstrap. Defaults to 1000.
"""

for task_name, metrics in self.metrics_values.items():
Expand Down Expand Up @@ -440,6 +476,7 @@ class VersionsLogger:
Attributes:
version (dict[str, int]): Maps the task names with the task versions.
"""

# the versions dict will be a dict of task_name: task_version
Expand All @@ -455,6 +492,7 @@ class TaskConfigLogger:
Attributes:
tasks_config (dict[str, TaskConfig]): Maps each task to its associated [`TaskConfig`]
"""

@dataclass
Expand All @@ -479,6 +517,7 @@ class TaskConfig:
truncated_num_docs (bool): Whether less than the total number of documents were used
output_regex (str)
frozen (bool)
"""

name: str
Expand Down
2 changes: 1 addition & 1 deletion src/lighteval/metrics/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -501,7 +501,7 @@ def higher_is_better():
return res

@staticmethod
def corpus_level_fns():
def corpus_level_fns() -> dict[str, callable]:
res = {}
for metric in Metrics:
if metric.value.category == MetricCategory.IGNORED:
Expand Down
85 changes: 75 additions & 10 deletions src/lighteval/models/model_config.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from argparse import Namespace
from dataclasses import dataclass
from typing import Optional, Union

import torch
from transformers import AutoConfig, BitsAndBytesConfig, GPTQConfig
from transformers import AutoConfig, BitsAndBytesConfig, GPTQConfig, PretrainedConfig

from lighteval.logging.hierarchical_logger import hlog
from lighteval.models.utils import _get_model_sha
Expand All @@ -23,15 +24,20 @@

@dataclass
class EnvConfig:
"""
Configuration class for environment settings.
Attributes:
cache_dir (str): directory for caching data.
token (str): authentication token used for accessing the HuggingFace Hub.
"""

cache_dir: str = None
token: str = None


@dataclass
class BaseModelConfig:
"""Args:
pretrained (str):
The HuggingFace Hub model ID name or the path to a pre-trained
HuggingFace Hub model ID name or the path to a pre-trained
model to load. This is effectively the `pretrained_model_name_or_path`
argument of `from_pretrained` in the HuggingFace `transformers` API.
add_special_tokens (bool, optional, defaults to True):
Expand All @@ -47,7 +53,51 @@ class BaseModelConfig:
dtype (Union[str, torch.dtype], optional, defaults to None):):
Converts the model weights to `dtype`, if specified. Strings get
converted to `torch.dtype` objects (e.g. `float16` -> `torch.float16`).
Use `dtype="auto"` to derive the type from the model’s weights.
Use `dtype="auto"` to derive the type from the model's weights.
"""


@dataclass
class BaseModelConfig:
"""
Base configuration class for models.
Attributes:
pretrained (str): HuggingFace Hub model ID name or the path to a
pre-trained model to load. This is effectively the
`pretrained_model_name_or_path` argument of `from_pretrained` in the
HuggingFace `transformers` API.
accelerator (Accelerator): accelerator to use for model training.
tokenizer (Optional[str]): HuggingFace Hub tokenizer ID that will be
used for tokenization.
multichoice_continuations_start_space (Optional[bool]): Whether to add a
space at the start of each continuation in multichoice generation.
For example, context: "What is the capital of France?" and choices: "Paris", "London".
Will be tokenized as: "What is the capital of France? Paris" and "What is the capital of France? London".
subfolder (Optional[str]): The subfolder within the model repository.
revision (str): The revision of the model.
batch_size (int): The batch size for model training.
max_gen_toks (Optional[int]): The maximum number of tokens to generate.
max_length (Optional[int]): The maximum length of the generated output.
add_special_tokens (bool, optional, defaults to True): Whether to add special tokens to the input sequences.
If `None`, the default value will be set to `True` for seq2seq models (e.g. T5) and
`False` for causal models.
model_parallel (Optional[bool]): Whether to use model parallelism.
dtype (Optional[Union[str, torch.dtype]]): data type of the model.
device (Union[int, str]): device to use for model training.
quantization_config (Optional[BitsAndBytesConfig]): quantization
configuration for the model. Needed for 4-bit and 8-bit precision.
load_in_8bit (bool): Whether to load the model in 8-bit precision.
load_in_4bit (bool): Whether to load the model in 4-bit precision.
trust_remote_code (bool): Whether to trust remote code during model
loading.
Methods:
__post_init__(): Performs post-initialization checks on the configuration.
_init_configs(model_name, env_config): Initializes the model configuration.
init_configs(env_config): Initializes the model configuration using the environment configuration.
get_model_sha(): Retrieves the SHA of the model.
"""

pretrained: str
Expand Down Expand Up @@ -77,7 +127,7 @@ def __post_init__(self):
if not isinstance(self.device, str):
raise ValueError("Current device must be passed as string.")

def _init_configs(self, model_name, env_config: EnvConfig):
def _init_configs(self, model_name: str, env_config: EnvConfig) -> PretrainedConfig:
revision = self.revision
if self.subfolder:
revision = f"{self.revision}/{self.subfolder}"
Expand All @@ -98,7 +148,7 @@ def _init_configs(self, model_name, env_config: EnvConfig):

return auto_config

def init_configs(self, env_config: EnvConfig):
def init_configs(self, env_config: EnvConfig) -> PretrainedConfig:
return self._init_configs(self.pretrained, env_config=env_config)

def get_model_sha(self):
Expand Down Expand Up @@ -165,8 +215,23 @@ class InferenceEndpointModelConfig:
should_reuse_existing: bool = False


def create_model_config(args, accelerator: Accelerator): # noqa C901
# Incompatible models
def create_model_config(args: Namespace, accelerator: Union["Accelerator", None]) -> BaseModelConfig: # noqa: C901
"""
Create a model configuration based on the provided arguments.
Args:
args (Namespace): command-line arguments.
accelerator (Union[Accelerator, None]): accelerator to use for model training.
Returns:
BaseModelConfig: model configuration.
Raises:
ValueError: If both an inference server address and model arguments are provided.
ValueError: If multichoice continuations both should start with a space and should not start with a space.
ValueError: If a base model is not specified when using delta weights or adapter weights.
ValueError: If a base model is specified when not using delta weights or adapter weights.
"""
if args.inference_server_address is not None and args.model_args is not None:
raise ValueError("You cannot both use an inference server and load a model from its checkpoint.")
if args.inference_server_address is not None and args.endpoint_model_name is not None:
Expand Down
Loading

0 comments on commit 203692a

Please sign in to comment.