diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index 3d36d76c2..0515af461 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -28,7 +28,8 @@ class EnhancedJSONEncoder(json.JSONEncoder): - """Provides a proper json encoding for the loggers and trackers json dumps. + """ + Provides a proper json encoding for the loggers and trackers json dumps. Notably manages the json encoding of dataclasses. """ @@ -39,10 +40,16 @@ def default(self, o): class EvaluationTracker: - """Keeps track of the overall evaluation process and relevant informations. - - The [`EvaluationTracker`] contains specific loggers for experiments details ([`DetailsLogger`]), metrics ([`MetricsLogger`]), task versions ([`VersionsLogger`]) as well as for the general configurations of both the specific task ([`TaskConfigLogger`]) and overall evaluation run ([`GeneralConfigLogger`]). - It compiles the data from these loggers and writes it to files, which can be published to the Hugging Face hub if requested. + """ + Keeps track of the overall evaluation process and relevant information. + + The [`EvaluationTracker`] contains specific loggers for experiment details + ([`DetailsLogger`]), metrics ([`MetricsLogger`]), task versions + ([`VersionsLogger`]) as well as for the general configurations of both the + specific task ([`TaskConfigLogger`]) and overall evaluation run + ([`GeneralConfigLogger`]). It compiles the data from these loggers and + writes it to files, which can be published to the Hugging Face hub if + requested. """ details_logger: DetailsLogger @@ -53,11 +60,15 @@ class EvaluationTracker: hub_results_org: str def __init__(self, hub_results_org: str = "", token: str = "") -> None: - """Creates all the necessary loggers for evaluation tracking. + """ + Creates all the necessary loggers for evaluation tracking. Args: - hub_results_org (str): The organisation to push the results to. 
See more details about the datasets organisation in [`EvaluationTracker.save`] - token (str): Token to use when pushing to the hub. This token should have write access to `hub_results_org`. + hub_results_org (str): The organisation to push the results to. See + more details about the datasets organisation in + [`EvaluationTracker.save`] + token (str): Token to use when pushing to the hub. This token should + have write access to `hub_results_org`. """ self.details_logger = DetailsLogger() self.metrics_logger = MetricsLogger() @@ -79,7 +90,8 @@ def save( ) -> None: """Saves the experiment information and results to files, and to the hub if requested. - Note: In case of save failure, this function will only print a warning, with the error message. + Note: + In case of save failure, this function will only print a warning, with the error message. Args: output_dir (str): Local folder path where you want results to be saved @@ -204,6 +216,7 @@ def details_to_hub( details_folder_path (str or Path): Local path of the current's experiment details folder. The details folder (created by [`EvaluationTracker.save`]) should contain one parquet file per task used during the evaluation run of the current model. push_as_public (bool, optional): If True, the results will be pushed publicly, else the datasets will be private. + """ results_file_path = str(results_file_path) details_folder_path = str(details_folder_path) @@ -255,6 +268,7 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None: Args: repo_id (str): Details dataset repository path on the hub (`org/dataset`) model_name (str): Name of the currently evaluated model. 
+ """ # Add a nice dataset card and the configuration YAML files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset") diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index 9fb4249ee..38d4d7abe 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -2,6 +2,7 @@ import os import time from dataclasses import asdict, dataclass, field +from typing import Union import git import numpy as np @@ -38,7 +39,7 @@ class GeneralConfigLogger: job_id (int): If the evaluation suite is launched as a slurm job, stores the current job id. Purely informative parameter used to retrieve scheduler logs. start_time (float): Start time of the experiment. Logged at class init. - end_time (float): Start time of the experiment. Logged when calling [`GeneralConfigLogger.log_end_time`] + end_time (float): End time of the experiment. Logged when calling [`GeneralConfigLogger.log_end_time`] total_evaluation_time_secondes (str): Inferred total evaluation time in seconds (from the start and end times). model_name (str): Name of the currently evaluated model. model_sha (str): Commit hash of the currently evaluated model on the hub if available. @@ -72,7 +73,30 @@ def __init__(self) -> None: self.lighteval_sha = repo.git.rev_parse("HEAD") self.start_time = time.perf_counter() - def log_args_info(self, num_fewshot_seeds, override_batch_size, max_samples, job_id, config=None) -> None: + def log_args_info( + self, + num_fewshot_seeds: int, + override_batch_size: Union[None, int], + max_samples: Union[None, int], + job_id: str, + config: "BrrrConfig" = None, + ) -> None: + """ + Logs the information about the arguments passed to the method. + + Args: + num_fewshot_seeds (int): number of few-shot seeds. + override_batch_size (Union[None, int]): overridden batch size. + If strictly positive, its value is used as the batch size for all experiments. 
+ Else, the batch size is automatically inferred depending on what fits in memory. + max_samples (Union[None, int]): maximum number of samples, if None, use all the samples available. + job_id (str): job ID, used to retrieve logs. + config (optional): BrrrConfig + + Returns: + None + + """ self.num_fewshot_seeds = num_fewshot_seeds self.override_batch_size = override_batch_size self.max_samples = max_samples @@ -80,6 +104,13 @@ def log_args_info(self, num_fewshot_seeds, override_batch_size, max_samples, job self.config = config def log_model_info(self, model_info: ModelInfo) -> None: + """ + Logs the model information. + + Args: + model_info (ModelInfo): Model information to be logged. + + """ self.model_name = model_info.model_name self.model_sha = model_info.model_sha self.model_dtype = model_info.model_dtype @@ -102,6 +133,7 @@ class DetailsLogger: Example: winogrande: [sample1_details, sample2_details, ...] compiled_details (dict[str, `CompiledDetail`]): : Maps each task name to the list of its samples' compiled details. compiled_details_over_all_tasks (CompiledDetailOverAllTasks): Aggregated details over all the tasks. + """ @dataclass() @@ -129,6 +161,7 @@ class Detail: choices (list): List of the possible choices (for multichoice/loglikelihood evaluations) gold_index (list): Indices of the gold targets among the [`choices`] metrics (dict): Metric name to current example score + """ example: str = "" @@ -160,9 +193,10 @@ class CompiledDetail: padded (int): Total umber of samples which needed padding during the batching step for the current task. non_padded (int): Total number of samples which did not need padding during the batching step for the current task. effective_few_shots (float): Average effective few shots across all samples for the current task. 
- The effective few shot is the number of few shots actually used to fit the prompt in the model context + The effective few shot is the number of few shots actually used to fit the prompt in the model context length while allowing model generation of the expected size. num_truncated_few_shots (int): Total number of samples which required truncated prompts to fit the model size for the current task. + """ hashes: dict = field(default_factory=dict) @@ -186,9 +220,10 @@ class CompiledDetailOverAllTasks: padded (int): Number of samples which needed padding during the batching step across all tasks. non_padded (int): Number of samples which did not need padding during the batching step across all tasks. effective_few_shots (float): Average effective few shots across all samples across all tasks. - The effective few shot is the number of few shots actually used to fit the prompt in the model context + The effective few shot is the number of few shots actually used to fit the prompt in the model context length while allowing model generation of the expected size. num_truncated_few_shots (int): Number of samples which required truncated prompts to fit the model size across all tasks. + """ hashes: dict = field(default_factory=dict) @@ -388,7 +423,8 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int = Args: task_dict (dict[str, LightevalTask]): used to determine what aggregation function to use for each metric - bootstrap_iters (int, optional): _description_. Defaults to 1000. + bootstrap_iters (int, optional): Number of runs used to run the statistical bootstrap. Defaults to 1000. + """ for task_name, metrics in self.metrics_values.items(): @@ -440,6 +476,7 @@ class VersionsLogger: Attributes: version (dict[str, int]): Maps the task names with the task versions. 
+ """ # the versions dict will be a dict of task_name: task_version @@ -455,6 +492,7 @@ class TaskConfigLogger: Attributes: tasks_config (dict[str, TaskConfig]): Maps each task to its associated [`TaskConfig`] + """ @dataclass @@ -479,6 +517,7 @@ class TaskConfig: truncated_num_docs (bool): Whether less than the total number of documents were used output_regex (str) frozen (bool) + """ name: str diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index df9af332a..318a4599f 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -501,7 +501,7 @@ def higher_is_better(): return res @staticmethod - def corpus_level_fns(): + def corpus_level_fns() -> dict[str, callable]: res = {} for metric in Metrics: if metric.value.category == MetricCategory.IGNORED: diff --git a/src/lighteval/models/model_config.py b/src/lighteval/models/model_config.py index 202b09d29..f1bbef43f 100644 --- a/src/lighteval/models/model_config.py +++ b/src/lighteval/models/model_config.py @@ -1,8 +1,9 @@ +from argparse import Namespace from dataclasses import dataclass from typing import Optional, Union import torch -from transformers import AutoConfig, BitsAndBytesConfig, GPTQConfig +from transformers import AutoConfig, BitsAndBytesConfig, GPTQConfig, PretrainedConfig from lighteval.logging.hierarchical_logger import hlog from lighteval.models.utils import _get_model_sha @@ -23,15 +24,20 @@ @dataclass class EnvConfig: + """ + Configuration class for environment settings. + + Attributes: + cache_dir (str): directory for caching data. + token (str): authentication token used for accessing the HuggingFace Hub. + """ + cache_dir: str = None token: str = None - -@dataclass -class BaseModelConfig: """Args: pretrained (str): - The HuggingFace Hub model ID name or the path to a pre-trained + HuggingFace Hub model ID name or the path to a pre-trained model to load. 
This is effectively the `pretrained_model_name_or_path` argument of `from_pretrained` in the HuggingFace `transformers` API. add_special_tokens (bool, optional, defaults to True): @@ -47,7 +53,51 @@ class BaseModelConfig: dtype (Union[str, torch.dtype], optional, defaults to None):): Converts the model weights to `dtype`, if specified. Strings get converted to `torch.dtype` objects (e.g. `float16` -> `torch.float16`). - Use `dtype="auto"` to derive the type from the model’s weights. + Use `dtype="auto"` to derive the type from the model's weights. + """ + + +@dataclass +class BaseModelConfig: + """ + Base configuration class for models. + + Attributes: + pretrained (str): HuggingFace Hub model ID name or the path to a + pre-trained model to load. This is effectively the + `pretrained_model_name_or_path` argument of `from_pretrained` in the + HuggingFace `transformers` API. + accelerator (Accelerator): accelerator to use for model training. + tokenizer (Optional[str]): HuggingFace Hub tokenizer ID that will be + used for tokenization. + multichoice_continuations_start_space (Optional[bool]): Whether to add a + space at the start of each continuation in multichoice generation. + For example, context: "What is the capital of France?" and choices: "Paris", "London". + Will be tokenized as: "What is the capital of France? Paris" and "What is the capital of France? London". + subfolder (Optional[str]): The subfolder within the model repository. + revision (str): The revision of the model. + batch_size (int): The batch size for model training. + max_gen_toks (Optional[int]): The maximum number of tokens to generate. + max_length (Optional[int]): The maximum length of the generated output. + add_special_tokens (bool, optional, defaults to True): Whether to add special tokens to the input sequences. + If `None`, the default value will be set to `True` for seq2seq models (e.g. T5) and + `False` for causal models. 
+ model_parallel (Optional[bool]): Whether to use model parallelism. + dtype (Optional[Union[str, torch.dtype]]): data type of the model. + device (Union[int, str]): device to use for model training. + quantization_config (Optional[BitsAndBytesConfig]): quantization + configuration for the model. Needed for 4-bit and 8-bit precision. + load_in_8bit (bool): Whether to load the model in 8-bit precision. + load_in_4bit (bool): Whether to load the model in 4-bit precision. + trust_remote_code (bool): Whether to trust remote code during model + loading. + + Methods: + __post_init__(): Performs post-initialization checks on the configuration. + _init_configs(model_name, env_config): Initializes the model configuration. + init_configs(env_config): Initializes the model configuration using the environment configuration. + get_model_sha(): Retrieves the SHA of the model. + """ pretrained: str @@ -77,7 +127,7 @@ def __post_init__(self): if not isinstance(self.device, str): raise ValueError("Current device must be passed as string.") - def _init_configs(self, model_name, env_config: EnvConfig): + def _init_configs(self, model_name: str, env_config: EnvConfig) -> PretrainedConfig: revision = self.revision if self.subfolder: revision = f"{self.revision}/{self.subfolder}" @@ -98,7 +148,7 @@ def _init_configs(self, model_name, env_config: EnvConfig): return auto_config - def init_configs(self, env_config: EnvConfig): + def init_configs(self, env_config: EnvConfig) -> PretrainedConfig: return self._init_configs(self.pretrained, env_config=env_config) def get_model_sha(self): @@ -165,8 +215,23 @@ class InferenceEndpointModelConfig: should_reuse_existing: bool = False -def create_model_config(args, accelerator: Accelerator): # noqa C901 - # Incompatible models +def create_model_config(args: Namespace, accelerator: Union["Accelerator", None]) -> BaseModelConfig: # noqa: C901 + """ + Create a model configuration based on the provided arguments. 
+ + Args: + args (Namespace): command-line arguments. + accelerator (Union[Accelerator, None]): accelerator to use for model training. + + Returns: + BaseModelConfig: model configuration. + + Raises: + ValueError: If both an inference server address and model arguments are provided. + ValueError: If multichoice continuations both should start with a space and should not start with a space. + ValueError: If a base model is not specified when using delta weights or adapter weights. + ValueError: If a base model is specified when not using delta weights or adapter weights. + """ if args.inference_server_address is not None and args.model_args is not None: raise ValueError("You cannot both use an inference server and load a model from its checkpoint.") if args.inference_server_address is not None and args.endpoint_model_name is not None: diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index ff7197fe4..e16963a9c 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -40,7 +40,19 @@ class LightevalTask: - def __init__(self, name: str, cfg: dict, cache_dir: str = None, custom_tasks_module=None): + def __init__(self, name: str, cfg: dict, cache_dir: Optional[str] = None, custom_tasks_module=None): + """ + Initialize a LightEval task. + + Args: + name (str): name of the task. + cfg (dict): configuration dictionary containing + task-specific settings (from the task_table.json file). + cache_dir (Optional[str], optional): directory to cache the + dataset. Defaults to None. + custom_tasks_module ([type], optional): A custom module + containing task-specific functions. Defaults to None. + """ self.name = name self.VERSION = 0 self.is_main_process = False @@ -108,6 +120,17 @@ def cfg(self): return self._cfg def doc_to_text_without_instructions(self, doc: Doc) -> str: + """ + Returns the query of the document without the instructions. 
If the + document has instructions, it removes them from the query: + + Args: + doc (Doc): document class, containing the query and the + instructions. + + Returns: + str: Query of the document without the instructions. + """ if doc.instruction is not None: if not doc.query.startswith(doc.instruction): raise ValueError(f"Prompt query {doc.query} is not starting with instruction {doc.instruction}") @@ -115,6 +138,18 @@ def doc_to_text_without_instructions(self, doc: Doc) -> str: return doc.query def doc_to_text_and_instructions(self, doc: Doc) -> Tuple[str, str]: + """ + Returns a tuple with the query of the document and the instructions. + If the document has no instructions, the second element of the tuple is + an empty string. + + Args: + doc (Doc): document, containing the query and the instructions. + + Returns: + Tuple[str, str]: A tuple with the query of the document and the + instructions. + """ if doc.instruction is not None: if not doc.query.startswith(doc.instruction): raise ValueError(f"Prompt query {doc.query} is not starting with instruction {doc.instruction}") @@ -122,10 +157,17 @@ def doc_to_text_and_instructions(self, doc: Doc) -> Tuple[str, str]: return (doc.query, "") def get_first_possible_fewshot_splits(self, number_of_splits: int = 1) -> list[str]: - """Parses the possible fewshot split keys in order: - train, then validation keys - and matches them with the available keys. - Returns the first available. + """ + Parses the possible fewshot split keys in order: train, then validation + keys and matches them with the available keys. Returns the first + available. + + Args: + number_of_splits (int, optional): Number of splits to return. + Defaults to 1. + + Returns: + list[str]: List of the first available fewshot splits. 
""" # Possible few shot splits are the available splits not used for evaluation possible_fewshot_splits = [k for k in self.all_available_splits if k not in self.evaluation_split] @@ -144,13 +186,24 @@ def get_first_possible_fewshot_splits(self, number_of_splits: int = 1) -> list[s hlog_warn(f"Careful, the task {self.name} is using evaluation data to build the few shot examples.") return None - def _get_docs_from_split(self, keys, few_shots=False) -> list[Doc]: + def _get_docs_from_split(self, splits: list[str], few_shots=False) -> list[Doc]: + """ + Get the documents from the dataset for the given keys (splits). + + Args: + splits (list[str]): List of splits, (e.g. ["train", "dev"]) + few_shots (bool, optional): Whether the documents are used for few + shot examples. Defaults to False. + + Returns: + list[Doc]: List of documents. + """ if self.dataset is None: self.dataset = download_dataset_worker((self.dataset_path, self.dataset_config_name)) docs = [] - for key in keys: - for item in self.dataset[key]: + for split in splits: + for item in self.dataset[split]: # Some tasks formatting is applied differently when the document is used for fewshot examples # vs when it's used for the actual prompt. That's why we store whether we are currently using the # doc for a fewshot sample (few_shots=True) or not, which then leads to the creation of a different Doc. @@ -159,6 +212,14 @@ def _get_docs_from_split(self, keys, few_shots=False) -> list[Doc]: return docs def fewshot_docs(self) -> list[Doc]: + """ + Returns the few shot documents. If the few shot documents are not + available, it gets them from the few shot split or the evaluation split. + + Returns: + list[Doc]: Documents that will be used for few shot examples. One + document = one few shot example. 
+ """ if self._fewshot_docs is None: self._fewshot_docs = [] @@ -170,11 +231,28 @@ def fewshot_docs(self) -> list[Doc]: return self._fewshot_docs def eval_docs(self) -> list[Doc]: + """ + Returns the evaluation documents. + + Returns: + list[Doc]: Evaluation documents. + """ if self._docs is None: self._docs = self._get_docs_from_split(self.evaluation_split) return self._docs - def doc_to_target(self, formatted_doc: Doc, few_shot: bool = False): + def doc_to_target(self, formatted_doc: Doc, few_shot: bool = False) -> str: + """ + Returns the target of the given document. + + Args: + formatted_doc (Doc): Formatted document. + few_shot (bool, optional): Whether the document is used for few + shot examples. Defaults to False. + + Returns: + str: Target of the document, which is the correct answer for a document. + """ if few_shot: if formatted_doc.target_for_fewshot_sorting is not None: return formatted_doc.target_for_fewshot_sorting @@ -184,6 +262,16 @@ def doc_to_target(self, formatted_doc: Doc, few_shot: bool = False): # Requests def get_request_type(self) -> list[RequestType]: + """ + Returns the request types for the task. + + Returns: + list[RequestType]: Request types for the task. + + Raises: + NotImplementedError: If the request type is not implemented for the + task. + """ request_types = [] if self.has_metric_category[MetricCategory.TARGET_PERPLEXITY]: request_types.append(RequestType.LOGLIKELIHOOD) @@ -207,16 +295,16 @@ def construct_requests( self, formatted_doc: Doc, context: str, document_id_seed: str, current_task_name: str ) -> List[Request]: """ - Constructs a list of requests based on the given parameters. + Constructs a list of requests from the task based on the given parameters. Args: - formatted_doc (Doc): The formatted document almost straight from the dataset. - ctx (str): The context, which is the few shot examples + the query. 
- document_id_seed (str): The index of the document in the task appended with the seed used for the few shot sampling. - current_task_name (str): The name of the current task. + formatted_doc (Doc): Formatted document almost straight from the dataset. + context (str): Context, which is the few shot examples + the query. + document_id_seed (str): Index of the document in the task appended with the seed used for the few shot sampling. + current_task_name (str): Name of the current task. Returns: - dict[RequestType, List[Request]]: The list of requests. + dict[RequestType, List[Request]]: Lists of requests, keyed by request type. """ requests = {type: [] for type in RequestType} @@ -282,7 +370,17 @@ def construct_requests( return requests - def process_results(self, formatted_doc: Doc, results: list[ModelReturn]): + def process_results(self, formatted_doc: Doc, results: list[ModelReturn]) -> dict[str, float]: + """ + Processes the results of the task, and stores them in the output dict. + + Args: + formatted_doc (Doc): formatted document of the task. + results (list[ModelReturn]): results of the task, returned by the model class after evaluation. + + Returns: + dict[str, float]: output dictionary containing the results of the task. + """ # Metrics management is done in metrics.__init__ outputs = {} if self.has_metric_category[MetricCategory.TARGET_PERPLEXITY]: @@ -319,16 +417,20 @@ def process_results(self, formatted_doc: Doc, results: list[ModelReturn]): return outputs def aggregation(self): + """ + Return a dict with metric name and its aggregation function for all + metrics + """ return Metrics.corpus_level_fns() @staticmethod def load_datasets(tasks: list["LightevalTask"], dataset_loading_processes: int = 1) -> None: """ - Load datasets for the given tasks. + Load datasets from the HuggingFace Hub for the given tasks. Args: tasks (list): A list of tasks. - dataset_loading_processes (int, optional): The number of processes to use for dataset loading. Defaults to 1. 
+ dataset_loading_processes (int, optional): number of processes to use for dataset loading. Defaults to 1. Returns: None @@ -349,6 +451,10 @@ def load_datasets(tasks: list["LightevalTask"], dataset_loading_processes: int = def download_dataset_worker(args): + """ + Worker function to download a dataset from the HuggingFace Hub. + Used for parallel dataset loading. + """ dataset_path, dataset_config_name = args dataset = load_dataset( path=dataset_path, @@ -370,22 +476,29 @@ def create_requests_from_tasks( # noqa: C901 use_chat_template: bool, ) -> Tuple[dict[RequestType, list[Request]], dict[TaskExampleId, Doc]]: """ - Takes a task dict and a fewshot dict and returns a dict of requests, a dict of docs, and a dict of requests origins. - The construction of prompts and thus the managing of few shots is done here. + Takes a task dict and a fewshot dict and returns a dict of requests and a + dict of docs. The construction of prompts and thus the managing of few + shots is done here. Args: - task_dict (_type_): _description_ - fewshot_dict (_type_): _description_ - num_fewshot_seeds (_type_): _description_ - lm (_type_): _description_ - max_samples (_type_): _description_ - evaluation_tracker (_type_): _description_ + task_dict (dict[str, LightevalTask]): A dictionary of tasks. + fewshot_dict (dict[str, list[Tuple[int, bool]]]): A dictionary of few + shot examples. + num_fewshot_seeds (int): number of few shot seeds. + lm (BaseModel): language model class that will be used to eventually + truncate the few shot examples (we need the maximum input size of the + model) + max_samples (int): maximum number of samples. + evaluation_tracker (EvaluationTracker): evaluation tracker. + use_chat_template (bool): Whether to use the chat template. Raises: - RuntimeError: _description_ + NotImplementedError: If the request type is not implemented for the + task. 
Returns: - _type_: _description_ + Tuple[dict[RequestType, list[Request]], dict[TaskExampleId, Doc]]: A + tuple containing the requests and the documents. """ docs: dict[TaskExampleId, Doc] = {} requests: dict[RequestType, list[Request]] = collections.defaultdict(list) diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index 1989584a3..f662bf5a8 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -22,11 +22,40 @@ class Registry: - def __init__(self, cache_dir): - self.cache_dir = cache_dir - self.TASK_REGISTRY = {**create_config_tasks(cache_dir=cache_dir)} + """ + The Registry class is used to manage the task registry and get task classes. + """ + + def __init__(self, cache_dir: str): + """ + Initialize the Registry class. + + Args: + cache_dir (str): Directory path for caching. + + Attributes: + cache_dir (str): Directory path for caching. + TASK_REGISTRY (dict[str, LightevalTask]): A dictionary containing the registered tasks. + """ + self.cache_dir: str = cache_dir + self.TASK_REGISTRY: dict[str, LightevalTask] = {**create_config_tasks(cache_dir=cache_dir)} + + def get_task_class( + self, task_name: str, custom_tasks_registry: Optional[dict[str, LightevalTask]] = None + ) -> LightevalTask: + """ + Get the task class based on the task name. + + Args: + task_name (str): Name of the task. + custom_tasks_registry (Optional[dict[str, LightevalTask]]): A dictionary containing custom tasks. + + Returns: + LightevalTask: Task class. - def get_task_class(self, task_name, custom_tasks_registry=None): + Raises: + ValueError: If the task is not found in the task registry or custom task registry. 
+ """ if task_name in self.TASK_REGISTRY: return self.TASK_REGISTRY[task_name] elif custom_tasks_registry is not None and task_name in custom_tasks_registry: @@ -41,14 +70,27 @@ def get_task_class(self, task_name, custom_tasks_registry=None): def get_task_dict( self, task_name_list: List[str], custom_tasks_file: Optional[str] = None ) -> Dict[str, LightevalTask]: - ## todo: make clearer + """ + Get a dictionary of tasks based on the task name list. + + Args: + task_name_list (List[str]): A list of task names. + custom_tasks_file (Optional[str]): Path to the custom tasks file. + + Returns: + Dict[str, LightevalTask]: A dictionary containing the tasks. + + Notes: + - If custom_tasks_file is provided, it will import the custom tasks module and create a custom tasks registry. + - Each task in the task_name_list will be instantiated with the corresponding task class. + """ if custom_tasks_file is not None: dataset_module = dataset_module_factory(str(custom_tasks_file)) custom_tasks_module = importlib.import_module(dataset_module.module_path) custom_tasks_registry = create_config_tasks( meta_table=custom_tasks_module.TASKS_TABLE, cache_dir=self.cache_dir ) - print(custom_tasks_registry) + hlog(custom_tasks_registry) else: custom_tasks_module = None custom_tasks_registry = None @@ -71,8 +113,8 @@ def get_custom_tasks(custom_tasks_file: str) -> Tuple[ModuleType, str]: def taskinfo_selector( - tasks: str, few_shot_default: int = 0 -) -> tuple[list[str], dict[str, list[tuple[int, bool]]], dict[str, str]]: + tasks: str, +) -> tuple[list[str], dict[str, list[tuple[int, bool]]]]: """ Selects task information based on the given tasks and description dictionary path. @@ -82,10 +124,9 @@ def taskinfo_selector( containing a list of tasks. Returns: - tuple[list[str], dict[str, list[tuple[int, bool]]], dict[str, str]]: A tuple containing: + tuple[list[str], dict[str, list[tuple[int, bool]]]]: A tuple containing: - A sorted list of unique task names in the format "suite|task". 
- A dictionary mapping each task name to a list of tuples representing the few_shot and truncate_few_shots values. - - A dictionary containing the description dictionary loaded from the given path, or an empty dictionary if no path is provided. """ few_shot_dict = collections.defaultdict(list) @@ -117,9 +158,20 @@ def taskinfo_selector( return sorted(few_shot_dict.keys()), {k: list(set(v)) for k, v in few_shot_dict.items()} -def create_config_tasks(meta_table=None, cache_dir: str = None) -> Dict[str, LightevalTask]: - """Creates a dictionary of tasks from a list of subjects - :return: {task_name: task} +def create_config_tasks( + meta_table: Optional[Dataset] = None, cache_dir: Optional[str] = None +) -> Dict[str, LightevalTask]: + """ + Create configuration tasks based on the provided meta_table. + + Args: + meta_table (Optional[Dataset]): meta_table containing task + configurations. If not provided, it will be loaded from TABLE_PATH. + cache_dir (Optional[str]): Directory to store cached data. If not + provided, the default cache directory will be used. + + Returns: + Dict[str, LightevalTask]: A dictionary of task names mapped to their corresponding LightevalTask classes. """ def create_task(name, cfg, cache_dir): diff --git a/src/lighteval/utils.py b/src/lighteval/utils.py index f8ec06655..246510fe9 100644 --- a/src/lighteval/utils.py +++ b/src/lighteval/utils.py @@ -12,12 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. import importlib -from typing import Union +from typing import Any, Union import numpy as np -def sanitize_numpy(example_dict): +def sanitize_numpy(example_dict: dict) -> dict: + """ + Sanitizes a dictionary by converting any numpy generic types to their corresponding Python types. + + Args: + example_dict (dict): The dictionary to be sanitized. + + Returns: + dict: The sanitized dictionary with numpy generic types converted to Python types. 
+ """ output_dict = {} for k, v in example_dict.items(): if isinstance(v, np.generic): @@ -27,7 +36,21 @@ def sanitize_numpy(example_dict): return output_dict -def as_list(item): +def as_list(item: Union[list, tuple, Any]) -> list: + """ + Convert the given item into a list. + + If the item is already a list, it is returned as is. + If the item is a tuple, it is converted into a list. + Otherwise, the item is wrapped in a list. + + Args: + item (Union[list, tuple, Any]): The item to be converted. + + Returns: + list: The converted list. + + """ if isinstance(item, list): return item elif isinstance(item, tuple): @@ -35,53 +58,62 @@ def as_list(item): return [item] -def flatten(item: list[Union[list, str]]): +def flatten(item: list[Union[list, str]]) -> list[str]: + """ + Flattens a nested list of strings into a single flat list. + + Args: + item (list[Union[list, str]]): The nested list to be flattened. + + Returns: + list[str]: The flattened list of strings. + """ flat_item = [] for sub_item in item: flat_item.extend(sub_item) if isinstance(sub_item, list) else flat_item.append(sub_item) return flat_item -def is_accelerate_available(): +def is_accelerate_available() -> bool: return importlib.util.find_spec("accelerate") is not None NO_ACCELERATE_ERROR_MSG = "You requested the use of accelerate for this evaluation, but it is not available in your current environement. Please install it using pip." -def is_tgi_available(): +def is_tgi_available() -> bool: return importlib.util.find_spec("text-generation") is not None NO_TGI_ERROR_MSG = "You are trying to start a text generation inference endpoint, but text-generation is not present in your local environement. Please install it using pip." -def is_nanotron_available(): +def is_nanotron_available() -> bool: return importlib.util.find_spec("nanotron") is not None NO_NANOTRON_ERROR_MSG = "YYou requested the use of nanotron for this evaluation, but it is not available in your current environement. 
Please install it using pip." -def is_optimum_available(): +def is_optimum_available() -> bool: return importlib.util.find_spec("optimum") is not None -def is_bnb_available(): +def is_bnb_available() -> bool: return importlib.util.find_spec("bitsandbytes") is not None NO_BNB_ERROR_MSG = "You are trying to load a model quantized with `bitsandbytes`, which is not available in your local environement. Please install it using pip." -def is_autogptq_available(): +def is_autogptq_available() -> bool: return importlib.util.find_spec("auto-gptq") is not None NO_AUTOGPTQ_ERROR_MSG = "You are trying to load a model quantized with `auto-gptq`, which is not available in your local environement. Please install it using pip." -def is_peft_available(): +def is_peft_available() -> bool: return importlib.util.find_spec("peft") is not None diff --git a/src/lighteval/utils_parallelism.py b/src/lighteval/utils_parallelism.py index a009eae96..7c38df46e 100644 --- a/src/lighteval/utils_parallelism.py +++ b/src/lighteval/utils_parallelism.py @@ -92,6 +92,16 @@ def decorator(*args, **kwargs): def test_all_gather(accelerator=None, parallel_context=None): + """ + Test the gather operation in a parallel setup. + + Args: + accelerator (Optional): The accelerator object used for parallelism. + parallel_context (Optional): The parallel context object used for parallelism. + + Raises: + ImportError: If the required accelerator or parallel context is not available. + """ if accelerator: if not is_accelerate_available(): raise ImportError(NO_ACCELERATE_ERROR_MSG)