Merge remote-tracking branch 'origin/main' into fix-brrr

thomwolf committed Feb 6, 2024
2 parents ac4a64a + 059e100 commit fbb2321
Showing 17 changed files with 460 additions and 165 deletions.
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -82,8 +82,8 @@ optimum = ["optimum==1.12.0"]
quantization = ["bitsandbytes>=0.41.0", "auto-gptq>=0.4.2"]
adapters = ["peft==0.3.0"]
nanotron = [
"nanotron@git+https://github.com/huggingface/nanotron@main",
"brrr@git+https://github.com/huggingface/brrr@fix-lighteval",
"nanotron@git+https://github.com/huggingface/nanotron@8c1a49588d0745a6404644a86547c2dd6a63640e",
"brrr@git+https://github.com/huggingface/brrr@e8a503e2ec08b34eed7522d331aec3bee8cdd29b",
"tensorboardX"
]

6 changes: 3 additions & 3 deletions src/lighteval/data.py
@@ -72,7 +72,7 @@ def get_original_order(self, new_arr: list) -> list:

return original_order

def get_set_split_start_end(self, split_id: int) -> tuple[int, int]:
def get_split_start_end(self, split_id: int) -> tuple[int, int]:
"""
Get the start and end indices of a dataset split.
@@ -96,7 +96,7 @@ def splits_start_end_iterator(self) -> tuple[int, int]:
tuple: A tuple containing the start and end indices of a split.
"""
for split_id in range(self.dataset_splits):
yield self.get_set_split_start_end(split_id)
yield self.get_split_start_end(split_id)

def __getitem__(self, index) -> Request:
"""
@@ -195,7 +195,7 @@ def _sorting_criteria(self, x) -> int:
return -(len(toks) + gen_length)


class GenerativeTaskDatasetBrrr(DynamicBatchDataset):
class GenerativeTaskDatasetNanotron(DynamicBatchDataset):
def _sorting_criteria(self, x) -> int:
"""
Collate function for generating batches.
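For context on the get_set_split_start_end -> get_split_start_end rename above, a minimal standalone sketch of how such a split iterator behaves; the attributes (total_size, dataset_splits) and the even-split logic are illustrative assumptions, not a copy of lighteval's DynamicBatchDataset.

class SplitSketch:
    def __init__(self, total_size: int, dataset_splits: int):
        self.total_size = total_size
        self.dataset_splits = dataset_splits

    def get_split_start_end(self, split_id: int) -> tuple[int, int]:
        # Contiguous, roughly equal chunks; the last split absorbs the remainder.
        split_size = self.total_size // self.dataset_splits
        start = split_id * split_size
        end = self.total_size if split_id == self.dataset_splits - 1 else start + split_size
        return start, end

    def splits_start_end_iterator(self):
        # Yields one (start, end) pair per split, as in the renamed method above.
        for split_id in range(self.dataset_splits):
            yield self.get_split_start_end(split_id)


print(list(SplitSketch(total_size=10, dataset_splits=3).splits_start_end_iterator()))
# [(0, 3), (3, 6), (6, 10)]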
32 changes: 23 additions & 9 deletions src/lighteval/logging/evaluation_tracker.py
@@ -28,7 +28,8 @@


class EnhancedJSONEncoder(json.JSONEncoder):
"""Provides a proper json encoding for the loggers and trackers json dumps.
"""
Provides a proper json encoding for the loggers and trackers json dumps.
Notably manages the json encoding of dataclasses.
"""

@@ -39,10 +40,16 @@ def default(self, o):


class EvaluationTracker:
"""Keeps track of the overall evaluation process and relevant informations.
The [`EvaluationTracker`] contains specific loggers for experiments details ([`DetailsLogger`]), metrics ([`MetricsLogger`]), task versions ([`VersionsLogger`]) as well as for the general configurations of both the specific task ([`TaskConfigLogger`]) and overall evaluation run ([`GeneralConfigLogger`]).
It compiles the data from these loggers and writes it to files, which can be published to the Hugging Face hub if requested.
"""
Keeps track of the overall evaluation process and relevant informations.
The [`EvaluationTracker`] contains specific loggers for experiments details
([`DetailsLogger`]), metrics ([`MetricsLogger`]), task versions
([`VersionsLogger`]) as well as for the general configurations of both the
specific task ([`TaskConfigLogger`]) and overall evaluation run
([`GeneralConfigLogger`]). It compiles the data from these loggers and
writes it to files, which can be published to the Hugging Face hub if
requested.
"""

details_logger: DetailsLogger
@@ -53,11 +60,15 @@ class EvaluationTracker:
hub_results_org: str

def __init__(self, hub_results_org: str = "", token: str = "") -> None:
"""Creates all the necessary loggers for evaluation tracking.
"""
Creates all the necessary loggers for evaluation tracking.
Args:
hub_results_org (str): The organisation to push the results to. See more details about the datasets organisation in [`EvaluationTracker.save`]
token (str): Token to use when pushing to the hub. This token should have write access to `hub_results_org`.
hub_results_org (str): The organisation to push the results to. See
more details about the datasets organisation in
[`EvaluationTracker.save`]
token (str): Token to use when pushing to the hub. This token should
have write access to `hub_results_org`.
"""
self.details_logger = DetailsLogger()
self.metrics_logger = MetricsLogger()
Expand All @@ -79,7 +90,8 @@ def save(
) -> None:
"""Saves the experiment information and results to files, and to the hub if requested.
Note: In case of save failure, this function will only print a warning, with the error message.
Note:
In case of save failure, this function will only print a warning, with the error message.
Args:
output_dir (str): Local folder path where you want results to be saved
@@ -204,6 +216,7 @@ def details_to_hub(
details_folder_path (str or Path): Local path of the current experiment's details folder.
The details folder (created by [`EvaluationTracker.save`]) should contain one parquet file per task used during the evaluation run of the current model.
push_as_public (bool, optional): If True, the results will be pushed publicly, else the datasets will be private.
"""
results_file_path = str(results_file_path)
details_folder_path = str(details_folder_path)
@@ -255,6 +268,7 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None:
Args:
repo_id (str): Details dataset repository path on the hub (`org/dataset`)
model_name (str): Name of the currently evaluated model.
"""
# Add a nice dataset card and the configuration YAML
files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset")
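A hedged usage sketch of the tracker described above. Only EvaluationTracker(hub_results_org=..., token=...) and a save(output_dir=...) argument are confirmed by this diff; the organisation name, token value, and the assumption that save can be called with just output_dir are placeholders.

from lighteval.logging.evaluation_tracker import EvaluationTracker

# "my-org" and the token are placeholders; the token must have write access to the org.
tracker = EvaluationTracker(hub_results_org="my-org", token="hf_xxx")

# ... run the evaluation; the tracker's loggers (details, metrics, versions,
# task config, general config) accumulate data along the way ...

tracker.save(output_dir="./eval_results")  # writes local files, pushes to the hub only if requested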
49 changes: 44 additions & 5 deletions src/lighteval/logging/info_loggers.py
@@ -2,6 +2,7 @@
import os
import time
from dataclasses import asdict, dataclass, field
from typing import Union

import git
import numpy as np
@@ -38,7 +39,7 @@ class GeneralConfigLogger:
job_id (int): If the evaluation suite is launched as a slurm job, stores the current job id.
Purely informative parameter used to retrieve scheduler logs.
start_time (float): Start time of the experiment. Logged at class init.
end_time (float): Start time of the experiment. Logged when calling [`GeneralConfigLogger.log_end_time`]
end_time (float): End time of the experiment. Logged when calling [`GeneralConfigLogger.log_end_time`]
total_evaluation_time_secondes (str): Inferred total evaluation time in seconds (from the start and end times).
model_name (str): Name of the currently evaluated model.
model_sha (str): Commit hash of the currently evaluated model on the hub if available.
@@ -72,14 +73,44 @@ def __init__(self) -> None:
self.lighteval_sha = repo.git.rev_parse("HEAD")
self.start_time = time.perf_counter()

def log_args_info(self, num_fewshot_seeds, override_batch_size, max_samples, job_id, config=None) -> None:
def log_args_info(
self,
num_fewshot_seeds: int,
override_batch_size: Union[None, int],
max_samples: Union[None, int],
job_id: str,
config: "BrrrConfig" = None,
) -> None:
"""
Logs the information about the arguments passed to the method.
Args:
num_fewshot_seeds (int): number of few-shot seeds.
override_batch_size (Union[None, int]): overridden batch size.
If strictly positive, its value is used as the batch size for all experiments.
Else, the batch size is automatically inferred depending on what fits in memory.
max_samples (Union[None, int]): maximum number of samples, if None, use all the samples available.
job_id (str): job ID, used to retrieve logs.
config (optional): BrrrConfig
Returns:
None
"""
self.num_fewshot_seeds = num_fewshot_seeds
self.override_batch_size = override_batch_size
self.max_samples = max_samples
self.job_id = job_id
self.config = config

def log_model_info(self, model_info: ModelInfo) -> None:
"""
Logs the model information.
Args:
model_info (ModelInfo): Model information to be logged.
"""
self.model_name = model_info.model_name
self.model_sha = model_info.model_sha
self.model_dtype = model_info.model_dtype
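An illustrative call matching the new log_args_info signature above; the argument values are placeholders, and config would be a BrrrConfig only when running under the nanotron/brrr backend.

from lighteval.logging.info_loggers import GeneralConfigLogger

logger = GeneralConfigLogger()
logger.log_args_info(
    num_fewshot_seeds=1,
    override_batch_size=None,  # None -> batch size inferred from what fits in memory
    max_samples=100,           # None would mean "use every available sample"
    job_id="4242",             # slurm job id, kept purely for log retrieval
    config=None,               # optionally a BrrrConfig
)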
Expand All @@ -102,6 +133,7 @@ class DetailsLogger:
Example: winogrande: [sample1_details, sample2_details, ...]
compiled_details (dict[str, `CompiledDetail`]): Maps each task name to the list of its samples' compiled details.
compiled_details_over_all_tasks (CompiledDetailOverAllTasks): Aggregated details over all the tasks.
"""

@dataclass()
@@ -129,6 +161,7 @@ class Detail:
choices (list): List of the possible choices (for multichoice/loglikelihood evaluations)
gold_index (list): Indices of the gold targets among the [`choices`]
metrics (dict): Metric name to current example score
"""

example: str = ""
@@ -160,9 +193,10 @@ class CompiledDetail:
padded (int): Total number of samples which needed padding during the batching step for the current task.
non_padded (int): Total number of samples which did not need padding during the batching step for the current task.
effective_few_shots (float): Average effective few shots across all samples for the current task.
The effective few shot is the number of few shots actually used to fit the prompt in the model context
effective few shot is the number of few shots actually used to fit the prompt in the model context
length while allowing model generation of the expected size.
num_truncated_few_shots (int): Total number of samples which required truncated prompts to fit the model size for the current task.
"""

hashes: dict = field(default_factory=dict)
Expand All @@ -186,9 +220,10 @@ class CompiledDetailOverAllTasks:
padded (int): Number of samples which needed padding during the batching step across all tasks.
non_padded (int): Number of samples which did not need padding during the batching step across all tasks.
effective_few_shots (float): Average effective few shots across all samples across all tasks.
The effective few shot is the number of few shots actually used to fit the prompt in the model context
effective few shot is the number of few shots actually used to fit the prompt in the model context
length while allowing model generation of the expected size.
num_truncated_few_shots (int): Number of samples which required truncated prompts to fit the model size across all tasks.
"""

hashes: dict = field(default_factory=dict)
@@ -388,7 +423,8 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int =
Args:
task_dict (dict[str, LightevalTask]): used to determine what aggregation function to use for each metric
bootstrap_iters (int, optional): _description_. Defaults to 1000.
bootstrap_iters (int, optional): Number of runs used to run the statistical bootstrap. Defaults to 1000.
"""

for task_name, metrics in self.metrics_values.items():
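For readers unfamiliar with the bootstrap_iters argument documented above, a generic sketch of what a statistical bootstrap over per-sample scores looks like; illustrative only, not lighteval's aggregation code.

import random


def bootstrap_stderr(scores, aggregate=lambda xs: sum(xs) / len(xs), iters=1000):
    # Resample the per-sample scores with replacement `iters` times and measure
    # the spread of the aggregated metric across the resamples.
    estimates = []
    for _ in range(iters):
        resample = [random.choice(scores) for _ in scores]
        estimates.append(aggregate(resample))
    mean = sum(estimates) / iters
    return (sum((e - mean) ** 2 for e in estimates) / (iters - 1)) ** 0.5


print(bootstrap_stderr([1, 0, 1, 1, 0, 1], iters=200))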
@@ -440,6 +476,7 @@ class VersionsLogger:
Attributes:
version (dict[str, int]): Maps the task names with the task versions.
"""

# the versions dict will be a dict of task_name: task_version
@@ -455,6 +492,7 @@ class TaskConfigLogger:
Attributes:
tasks_config (dict[str, TaskConfig]): Maps each task to its associated [`TaskConfig`]
"""

@dataclass
@@ -479,6 +517,7 @@ class TaskConfig:
truncated_num_docs (bool): Whether less than the total number of documents were used
output_regex (str)
frozen (bool)
"""

name: str
19 changes: 4 additions & 15 deletions src/lighteval/metrics/__init__.py
@@ -7,21 +7,12 @@


def apply_target_perplexity_metric(results: list[ModelReturn], formatted_doc: Doc, metrics: list[str]):
if len(formatted_doc.get_golds()) != 1:
raise ValueError("Target perplexity metric can only be used with one gold reference")
outputs = {}
reference_text = formatted_doc.get_golds()[0]
current_result = results.pop(0)
target_logprob = current_result.result[0]
target_acc = current_result.result[1]
current_results = [results.pop(0) for _ in range(len(formatted_doc.get_golds()))]

for metric in metrics:
if Metrics[metric].value.category == MetricCategory.TARGET_PERPLEXITY:
outputs.update(
Metrics[metric].value.compute(
logprobs=target_logprob, target_acc=target_acc, reference_text=reference_text
)
)
if Metrics[metric].value.category == MetricCategory.PERPLEXITY:
outputs.update(Metrics[metric].value.compute(results=current_results))

return results, outputs

@@ -39,9 +30,7 @@ def apply_perplexity_metric(results: list[ModelReturn], formatted_doc: Doc, metr

for metric in metrics:
if Metrics[metric].value.category == MetricCategory.PERPLEXITY:
outputs.update(
Metrics[metric].value.compute(logprobs=current_result.result, reference_text=reference_text)
)
outputs.update(Metrics[metric].value.compute(results=current_result, reference_text=reference_text))

return results, outputs

2 changes: 1 addition & 1 deletion src/lighteval/metrics/metrics.py
@@ -501,7 +501,7 @@ def higher_is_better():
return res

@staticmethod
def corpus_level_fns():
def corpus_level_fns() -> dict[str, callable]:
res = {}
for metric in Metrics:
if metric.value.category == MetricCategory.IGNORED:
7 changes: 4 additions & 3 deletions src/lighteval/metrics/metrics_sample.py
@@ -275,16 +275,17 @@ def compute(self, choices_logprob: list[float], gold_ixs: list[float], formatted
return 1.0 / (min(ranked_choices) + 1)


def acc_golds_likelihood(target_acc: list[int] | int, **kwargs) -> int:
def acc_golds_likelihood(results: list[tuple[float, int]], **kwargs) -> int:
"""Tests if at least one of predicted gold targets' log-likelihood is above 0.5.
Args:
target_acc (list[int]): List of scores indicating whether the predictions log-probabilities are above 0.5 aggregated.
results (list[int]): List of tuples containing, for each gold, the predictions log-probabilities associated with whether they are above 0.5 aggregated.
formatted_doc (Doc): _description_
Returns:
int: 1 if at least one of the possible golds had a log-likelihood above 0.5.
"""
return max([int(acc_ppl) for acc_ppl in as_list(target_acc)])
return max([int(acc_ppl) for _, acc_ppl in results])
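Worked example of the updated acc_golds_likelihood contract shown above (standalone copy for illustration): results holds one (logprob, above_0.5_flag) tuple per gold target, and the metric is 1 as soon as any gold passes the threshold.

def acc_golds_likelihood(results: list[tuple[float, int]], **kwargs) -> int:
    # Same shape as the function above: keep only the per-gold accuracy flag.
    return max([int(acc_ppl) for _, acc_ppl in results])


print(acc_golds_likelihood([(-1.7, 0), (-0.2, 1)]))  # 1 -> at least one gold above 0.5
print(acc_golds_likelihood([(-2.3, 0), (-1.9, 0)]))  # 0 -> no gold above 0.5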


class ROUGE:
6 changes: 3 additions & 3 deletions src/lighteval/metrics/sample_preparator.py
@@ -106,14 +106,14 @@ def count_units(self, text: str) -> int:
if self.units_type == "bytes":
return len(text.encode("utf-8"))

def prepare(self, logprobs: list[float] | float, reference_text: str, **kwargs):
def prepare(self, results, reference_text, **kwargs):
"""Prepares an individual perplexity example to the format expected by metrics computed at the corpus level (aggregated).
Args:
logprobs (list[float]): List of the logprobabilities computed for each item of the sequence or single aggregated logprob over the sequence
results (list[float]): List of the logprobabilities computed for each item
reference_text (str): Current reference text for which to compute the length in self.units_type
Returns:
PerplexityCorpusMetricInput: Stores the measured logprobs and associated text lengths, counted in the reference unit.
"""
return PerplexityCorpusMetricInput(logprobs=logprobs, weights=self.count_units(reference_text))
return PerplexityCorpusMetricInput(logprobs=results.result, weights=self.count_units(reference_text))
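To make the weighting concrete: prepare pairs the measured logprobs with a reference length counted in units_type units. Only the "bytes" branch of count_units appears in this diff; the "words" variant below is an assumption added for illustration.

def count_units(text: str, units_type: str = "words") -> int:
    # "words" is assumed here; only the "bytes" branch is shown in the diff above.
    if units_type == "words":
        return len(text.split())
    if units_type == "bytes":
        return len(text.encode("utf-8"))
    raise ValueError(f"Unsupported units_type: {units_type}")


print(count_units("the cat sat on the mat"))             # 6 words
print(count_units("héllo", units_type="bytes"))          # 6 bytes ("é" is 2 bytes in utf-8)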