Skip to content

Commit

Permalink
Merge branch 'main' into clem_doc_readme
Browse files Browse the repository at this point in the history
  • Loading branch information
NathanHB authored Feb 7, 2024
2 parents ddb17c6 + 8aaf51c commit 2ed3684
Show file tree
Hide file tree
Showing 16 changed files with 481 additions and 470 deletions.
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,5 @@ repos:
rev: 'v0.1.6'
hooks:
- id: ruff
args: ['--fix']
- id: ruff-format
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ However, we are very grateful to the Harness and HELM teams for their continued
- [tests](https://github.com/huggingface/lighteval/tree/main/tests) contains our test suite, which we run on each PR to prevent regressions in metrics/prompts/tasks for a subset of important tasks.

## How to install and use

Note:
- Use the Eleuther AI Harness (`lm_eval`) to share comparable numbers with everyone (e.g. on the Open LLM Leaderboard).
- Use `lighteval` during training with the nanotron/datatrove LLM training stack and/or for quick eval/benchmark experiments.
Expand Down
3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,7 @@ optimum = ["optimum==1.12.0"]
quantization = ["bitsandbytes>=0.41.0", "auto-gptq>=0.4.2"]
adapters = ["peft==0.3.0"]
nanotron = [
"nanotron@git+https://github.com/huggingface/nanotron@8c1a49588d0745a6404644a86547c2dd6a63640e",
"brrr@git+https://github.com/huggingface/brrr@e8a503e2ec08b34eed7522d331aec3bee8cdd29b",
"nanotron@git+https://github.com/huggingface/nanotron",
"tensorboardX"
]

Expand Down
92 changes: 92 additions & 0 deletions run_evals_accelerate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import argparse

from lighteval.main_accelerate import CACHE_DIR, main


def get_parser():
    """Build the command-line parser for the accelerate evaluation script.

    Exactly one model source must be selected among ``--model_args``,
    ``--inference_server_address`` and ``--endpoint_model_name``.
    ``--tasks`` and ``--output_dir`` are mandatory as well.

    Returns:
        argparse.ArgumentParser: The fully configured parser.
    """
    cli = argparse.ArgumentParser()
    # Both required groups are created first; their options are registered
    # further down, next to the related plain options.
    tasks_group = cli.add_mutually_exclusive_group(required=True)
    model_source_group = cli.add_mutually_exclusive_group(required=True)

    # Model type 1) Base model
    # Delta weights and adapter weights cannot be requested together.
    weights_kind_group = cli.add_mutually_exclusive_group()
    weights_kind_group.add_argument(
        "--delta_weights",
        help="set to True of your model should be merged with a base model, also need to provide the base model name",
        action="store_true",
    )
    weights_kind_group.add_argument(
        "--adapter_weights",
        help="set to True of your model has been trained with peft, also need to provide the base model name",
        action="store_true",
    )
    cli.add_argument(
        "--base_model",
        help="name of the base model to be used for delta or adapter weights",
        default=None,
        type=str,
    )

    model_source_group.add_argument("--model_args")
    cli.add_argument("--model_dtype", default=None, type=str)
    cli.add_argument(
        "--multichoice_continuations_start_space",
        help="Whether to force multiple choice continuations to start with a space",
        action="store_true",
    )
    cli.add_argument(
        "--no_multichoice_continuations_start_space",
        help="Whether to force multiple choice continuations to not start with a space",
        action="store_true",
    )
    cli.add_argument("--use_chat_template", action="store_true")

    # Model type 2) TGI
    model_source_group.add_argument("--inference_server_address", type=str)
    cli.add_argument("--inference_server_auth", default=None, type=str)

    # Model type 3) Inference endpoints
    model_source_group.add_argument("--endpoint_model_name", type=str)
    # These endpoint options all share the same shape: optional free-form strings.
    for endpoint_option in ("--accelerator", "--vendor", "--region", "--instance_size", "--instance_type"):
        cli.add_argument(endpoint_option, type=str, default=None)
    cli.add_argument("--reuse_existing", action="store_true")

    # Debug
    cli.add_argument("--max_samples", default=None, type=int)
    cli.add_argument("--job_id", default="", type=str, help="Optional Job ID for future reference")

    # Saving
    cli.add_argument("--push_results_to_hub", action="store_true")
    cli.add_argument("--save_details", action="store_true")
    cli.add_argument("--push_details_to_hub", action="store_true")
    cli.add_argument(
        "--public_run",
        help="Push results and details to a public repo",
        action="store_true",
    )
    cli.add_argument("--cache_dir", default=CACHE_DIR, type=str)
    cli.add_argument(
        "--results_org",
        help="Hub organisation where you want to store the results. Your current token must have write access to it",
        type=str,
    )

    # Common parameters
    cli.add_argument("--output_dir", required=True)
    cli.add_argument("--override_batch_size", default=-1, type=int)
    cli.add_argument("--dataset_loading_processes", default=1, type=int)
    cli.add_argument(
        "--custom_tasks_file",
        help="Path to a file with custom tasks (a TASK list of dict and potentially prompt formating functions)",
        default=None,
        type=str,
    )
    tasks_group.add_argument(
        "--tasks",
        help="Id of a task, e.g. 'original|mmlu:abstract_algebra|5' or path to a texte file with a list of tasks",
        default=None,
        type=str,
    )
    cli.add_argument("--num_fewshot_seeds", default=1, type=int, help="Number of trials the few shots")
    return cli


if __name__ == "__main__":
    # parse_known_args tolerates unrecognised command-line options; they are
    # collected separately and not forwarded to main().
    cli_args, _unrecognised = get_parser().parse_known_args()
    main(cli_args)
33 changes: 33 additions & 0 deletions run_evals_nanotron.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# flake8: noqa: C901
import argparse

from lighteval.main_nanotron import main


def get_parser():
    """Build the command-line parser for the nanotron evaluation script.

    Three options are registered: ``--checkpoint-config-path`` (mandatory),
    ``--lighteval-override`` (optional, no default) and ``--cache-dir``
    (optional, defaults to an empty string).

    Returns:
        argparse.ArgumentParser: The fully configured parser.
    """
    cli = argparse.ArgumentParser()

    # (flag, keyword arguments) pairs, registered in declaration order.
    option_specs = (
        (
            "--checkpoint-config-path",
            {
                "type": str,
                "required": True,
                "help": "Path to the brr checkpoint YAML or python config file, potentially on S3",
            },
        ),
        (
            "--lighteval-override",
            {
                "type": str,
                "help": "Path to an optional YAML or python Lighteval config to override part of the checkpoint Lighteval config",
            },
        ),
        (
            "--cache-dir",
            {
                "type": str,
                "default": "",
                "help": "Cache directory",
            },
        ),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)

    return cli


if __name__ == "__main__":
    # parse_known_args tolerates unrecognised command-line options; they are
    # collected separately and not forwarded to main().
    cli_args, _unrecognised = get_parser().parse_known_args()
    main(
        cli_args.checkpoint_config_path,
        cli_args.lighteval_override,
        cli_args.cache_dir,
    )
31 changes: 31 additions & 0 deletions src/lighteval/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,37 @@ def _sorting_criteria(self, request: GreedyUntilRequest | GreedyUntilWithLogitsR
return -(len(toks) + gen_length)


class GenerativeTaskDatasetNanotron(DynamicBatchDataset):
    """Dynamic-batching dataset of generative requests that also yields each item's index."""

    def __getitem__(self, index) -> tuple[int, Request]:
        """Return an item of the current split together with the index used to fetch it.

        The lookup is relative to the current split: the element read is
        ``self.sorted_data[index + self.split_start]``. Used for dynamic batching.

        Args:
            index (int): Position of the item relative to the start of the current split.

        Returns:
            tuple[int, Request]: ``(index, item)`` where ``index`` is the value
            passed in (not the absolute position in ``self.sorted_data``) and
            ``item`` is the stored element, presumably a ``Request``.
        """
        return index, self.sorted_data[index + self.split_start]

    def _sorting_criteria(self, request) -> int:
        """Compute the sort key of a request.

        The key is the negated sum of the tokenized context length and the
        requested generation size, so an ascending sort on this key places the
        longest requests first.

        Args:
            request: Object exposing ``tokenized_context`` (a sized sequence)
                and ``generation_size`` (a number).

        Returns:
            int: ``-(len(request.tokenized_context) + request.generation_size)``.
        """
        toks = request.tokenized_context
        gen_length = request.generation_size
        return -(len(toks) + gen_length)


class GenDistributedSampler(DistributedSampler):
"""A distributed sampler that copy the last element only when drop_last is False so we keep a small padding in the batches
as our samples are sorted by length.
Expand Down
4 changes: 2 additions & 2 deletions src/lighteval/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import copy
from typing import Dict, Union

from pytablewriter import LatexTableWriter, MarkdownTableWriter

from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.logging.hierarchical_logger import hlog
from lighteval.models.base_model import BaseModel
Expand Down Expand Up @@ -99,8 +101,6 @@ def evaluate( # noqa: C901

def make_results_table(result_dict):
"""Generate table of results."""
from pytablewriter import LatexTableWriter, MarkdownTableWriter

md_writer = MarkdownTableWriter()
latex_writer = LatexTableWriter()
md_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]
Expand Down
150 changes: 74 additions & 76 deletions src/lighteval/logging/evaluation_tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,11 @@
TaskConfigLogger,
VersionsLogger,
)
from lighteval.utils import is_nanotron_available
from lighteval.utils import is_nanotron_available, obj_to_markdown


if is_nanotron_available():
from brrr.config import BrrrConfig
from brrr.experiment_loggers import obj_to_markdown
from nanotron.config import get_config_from_dict
from nanotron.config import Config, get_config_from_dict


class EnhancedJSONEncoder(json.JSONEncoder):
Expand Down Expand Up @@ -104,81 +102,81 @@ def save(
"""
hlog("Saving experiment tracker")
try:
date_id = datetime.now().isoformat().replace(":", "-")

output_dir_results = Path(output_dir) / "results" / self.general_config_logger.model_name
output_dir_details = Path(output_dir) / "details" / self.general_config_logger.model_name
output_dir_details_sub_folder = output_dir_details / date_id
output_dir_results.mkdir(parents=True, exist_ok=True)
output_dir_details_sub_folder.mkdir(parents=True, exist_ok=True)

output_results_file = output_dir_results / f"results_{date_id}.json"
output_results_in_details_file = output_dir_details / f"results_{date_id}.json"

hlog(f"Saving results to {output_results_file} and {output_results_in_details_file}")

to_dump = {
"config_general": asdict(self.general_config_logger),
"results": self.metrics_logger.metric_aggregated,
"versions": self.versions_logger.versions,
"config_tasks": self.task_config_logger.tasks_configs,
"summary_tasks": self.details_logger.compiled_details,
"summary_general": asdict(self.details_logger.compiled_details_over_all_tasks),
}
dumped = json.dumps(to_dump, cls=EnhancedJSONEncoder, indent=2)

with open(output_results_file, "w") as f:
f.write(dumped)

with open(output_results_in_details_file, "w") as f:
f.write(dumped)

for task_name, task_details in self.details_logger.details.items():
output_file_details = output_dir_details_sub_folder / f"details_{task_name}_{date_id}.parquet"
# Create a dataset from the dictionary
try:
dataset = Dataset.from_list([asdict(detail) for detail in task_details])
except Exception:
# We force cast to str to avoid formatting problems for nested objects
dataset = Dataset.from_list(
[{k: str(v) for k, v in asdict(detail).items()} for detail in task_details]
)
# try:
date_id = datetime.now().isoformat().replace(":", "-")

# We don't keep 'id' around if it's there
column_names = dataset.column_names
if "id" in dataset.column_names:
column_names = [t for t in dataset.column_names if t != "id"]

# Sort column names to make it easier later
dataset = dataset.select_columns(sorted(column_names))
# Save the dataset to a Parquet file
dataset.to_parquet(output_file_details.as_posix())

if push_results_to_hub:
self.api.upload_folder(
repo_id=self.hub_results_repo if public else self.hub_private_results_repo,
folder_path=output_dir_results,
path_in_repo=self.general_config_logger.model_name,
repo_type="dataset",
commit_message=f"Updating model {self.general_config_logger.model_name}",
)
output_dir_results = Path(output_dir) / "results" / self.general_config_logger.model_name
output_dir_details = Path(output_dir) / "details" / self.general_config_logger.model_name
output_dir_details_sub_folder = output_dir_details / date_id
output_dir_results.mkdir(parents=True, exist_ok=True)
output_dir_details_sub_folder.mkdir(parents=True, exist_ok=True)

if push_details_to_hub:
self.details_to_hub(
model_name=self.general_config_logger.model_name,
results_file_path=output_results_in_details_file,
details_folder_path=output_dir_details_sub_folder,
push_as_public=public,
)
output_results_file = output_dir_results / f"results_{date_id}.json"
output_results_in_details_file = output_dir_details / f"results_{date_id}.json"

hlog(f"Saving results to {output_results_file} and {output_results_in_details_file}")

if push_results_to_tensorboard:
self.push_results_to_tensorboard(
results=self.metrics_logger.metric_aggregated, details=self.details_logger.details
to_dump = {
"config_general": asdict(self.general_config_logger),
"results": self.metrics_logger.metric_aggregated,
"versions": self.versions_logger.versions,
"config_tasks": self.task_config_logger.tasks_configs,
"summary_tasks": self.details_logger.compiled_details,
"summary_general": asdict(self.details_logger.compiled_details_over_all_tasks),
}
dumped = json.dumps(to_dump, cls=EnhancedJSONEncoder, indent=2)

with open(output_results_file, "w") as f:
f.write(dumped)

with open(output_results_in_details_file, "w") as f:
f.write(dumped)

for task_name, task_details in self.details_logger.details.items():
output_file_details = output_dir_details_sub_folder / f"details_{task_name}_{date_id}.parquet"
# Create a dataset from the dictionary
try:
dataset = Dataset.from_list([asdict(detail) for detail in task_details])
except Exception:
# We force cast to str to avoid formatting problems for nested objects
dataset = Dataset.from_list(
[{k: str(v) for k, v in asdict(detail).items()} for detail in task_details]
)
except Exception as e:
hlog("WARNING: Could not save results")
hlog(repr(e))

# We don't keep 'id' around if it's there
column_names = dataset.column_names
if "id" in dataset.column_names:
column_names = [t for t in dataset.column_names if t != "id"]

# Sort column names to make it easier later
dataset = dataset.select_columns(sorted(column_names))
# Save the dataset to a Parquet file
dataset.to_parquet(output_file_details.as_posix())

if push_results_to_hub:
self.api.upload_folder(
repo_id=self.hub_results_repo if public else self.hub_private_results_repo,
folder_path=output_dir_results,
path_in_repo=self.general_config_logger.model_name,
repo_type="dataset",
commit_message=f"Updating model {self.general_config_logger.model_name}",
)

if push_details_to_hub:
self.details_to_hub(
model_name=self.general_config_logger.model_name,
results_file_path=output_results_in_details_file,
details_folder_path=output_dir_details_sub_folder,
push_as_public=public,
)

if push_results_to_tensorboard:
self.push_results_to_tensorboard(
results=self.metrics_logger.metric_aggregated, details=self.details_logger.details
)
# except Exception as e:
# hlog("WARNING: Could not save results")
# hlog(repr(e))

def generate_final_dict(self) -> dict:
"""Aggregates and returns all the logger's experiment information in a dictionary.
Expand Down Expand Up @@ -487,7 +485,7 @@ def push_results_to_tensorboard( # noqa: C901
if not is_nanotron_available():
hlog_warn("You cannot push results to tensorboard with having nanotron installed. Skipping")
return
config: BrrrConfig = get_config_from_dict(self.general_config_logger.config, config_class=BrrrConfig)
config: Config = get_config_from_dict(self.general_config_logger.config, config_class=Config)
lighteval_config = config.lighteval
try:
global_step = config.general.step
Expand Down
Loading

0 comments on commit 2ed3684

Please sign in to comment.