redo logging (#415)
- revamped the logging with a config modifying every logger used in lighteval (pattern sketched below, after the change summary)
- made accelerate a default requirement
- fixed some documentation
NathanHB authored Dec 5, 2024
1 parent b68d5bc commit 1fb7968
Showing 33 changed files with 345 additions and 404 deletions.
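The first bullet above is the core of the change: lighteval's hierarchical logger helpers (`hlog`, `hlog_warn`) are replaced throughout by the standard `logging` module, configured once with `colorlog` at the CLI entry point. A minimal sketch of the resulting pattern, condensed from the diffs below (the formatter and handler names `"color"` and `"console"` are illustrative placeholders, not the exact keys lighteval uses):

```python
# Minimal sketch of the logging setup this commit introduces; "color" and
# "console" are placeholder names, not the actual keys used in lighteval.
import logging
from logging.config import dictConfig

import colorlog

# Configured once at the CLI entry point (cf. src/lighteval/__main__.py below).
dictConfig(
    {
        "version": 1,
        "formatters": {
            "color": {
                "()": colorlog.ColoredFormatter,
                "format": "[%(asctime)s] [%(log_color)s%(levelname)8s%(reset)s]: %(message)s",
            },
        },
        "handlers": {
            "console": {"class": "logging.StreamHandler", "formatter": "color", "level": logging.INFO},
        },
        "root": {"handlers": ["console"], "level": logging.INFO},
    }
)

# Every other module simply asks for its own logger and inherits the root config.
logger = logging.getLogger(__name__)
logger.info("Saving experiment tracker")  # replaces hlog(...)
logger.warning("num_dataset_splits too large, falling back to 1")  # replaces hlog_warn(...)
```

Because the configuration is attached to the root logger, every `logging.getLogger(__name__)` call in the code base picks up the colored console handler without further setup.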
24 changes: 14 additions & 10 deletions README.md
@@ -55,7 +55,7 @@ Hub, S3, or locally.
## ⚡️ Installation

```bash
pip install lighteval[accelerate]
pip install lighteval
```

Lighteval allows for many extras when installing; see [here](https://github.com/huggingface/lighteval/wiki/Installation) for a complete list.
@@ -71,20 +71,24 @@ huggingface-cli login

Lighteval offers two main entry points for model evaluation:


* `lighteval accelerate`: evaluate models on CPU or one or more GPUs using [🤗
Accelerate](https://github.com/huggingface/accelerate).
* `lighteval nanotron`: evaluate models in distributed settings using [⚡️
Nanotron](https://github.com/huggingface/nanotron).
- `lighteval accelerate` : evaluate models on CPU or one or more GPUs using [🤗
Accelerate](https://github.com/huggingface/accelerate)
- `lighteval nanotron`: evaluate models in distributed settings using [⚡️
Nanotron](https://github.com/huggingface/nanotron)
- `lighteval vllm`: evaluate models on one or more GPUs using [🚀
VLLM](https://github.com/vllm-project/vllm)
- `lighteval endpoint`
- `inference-endpoint`: evaluate models on one or more GPUs using [🔗
Inference Endpoint](https://huggingface.co/inference-endpoints/dedicated)
- `tgi`: evaluate models on one or more GPUs using [🔗 Text Generation Inference](https://huggingface.co/docs/text-generation-inference/en/index)
- `openai`: evaluate models on one or more GPUs using [🔗 OpenAI API](https://platform.openai.com/)

Here’s a quick command to evaluate using the Accelerate backend:

```shell
lighteval accelerate \
--model_args "pretrained=gpt2" \
--tasks "leaderboard|truthfulqa:mc|0|0" \
--override_batch_size 1 \
--output_dir="./evals/"
"pretrained=gpt2" \
"leaderboard|truthfulqa:mc|0|0"
```

## 🙏 Acknowledgements
1 change: 0 additions & 1 deletion docs/source/installation.mdx
@@ -25,7 +25,6 @@ appropriate extras group.

| extra name | description |
|--------------|---------------------------------------------------------------------------|
| accelerate | To use accelerate for model and data parallelism with transformers models |
| tgi | To use Text Generation Inference API to evaluate your model |
| nanotron | To evaluate nanotron models |
| quantization | To evaluate quantized models |
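
With `accelerate` promoted to a base dependency, its row disappears from this table and from `[project.optional-dependencies]` in `pyproject.toml` below. Assuming the remaining extra names listed here, a typical install after this change looks roughly like:

```bash
# accelerate now ships with the base package
pip install lighteval

# other backends remain opt-in extras (names as listed in the table above)
pip install "lighteval[tgi]"
pip install "lighteval[nanotron,quantization]"
```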
5 changes: 3 additions & 2 deletions pyproject.toml
@@ -55,6 +55,7 @@ keywords = ["evaluation", "nlp", "llm"]
dependencies = [
# Base dependencies
"transformers>=4.38.0",
"accelerate",
"huggingface_hub>=0.23.0",
"torch>=2.0,<2.5",
"GitPython>=3.1.41", # for logging
@@ -64,7 +65,8 @@ dependencies = [
"typer",
"termcolor==2.3.0",
"pytablewriter",
"colorama",
"rich",
"colorlog",
# Extension of metrics
"aenum==3.1.15",
# Base metrics
@@ -80,7 +82,6 @@
]

[project.optional-dependencies]
accelerate = ["accelerate"]
tgi = ["text-generation==0.6.0"]
optimum = ["optimum==1.12.0"]
quantization = ["bitsandbytes>=0.41.0", "auto-gptq>=0.4.2"]
26 changes: 26 additions & 0 deletions src/lighteval/__main__.py
@@ -19,7 +19,10 @@
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import logging
from logging.config import dictConfig

import colorlog
import typer

import lighteval.main_accelerate
@@ -32,6 +35,29 @@

app = typer.Typer()

logging_config = dict(  # noqa C408
    version=1,
    formatters={
        "c": {
            "()": colorlog.ColoredFormatter,
            "format": "[%(asctime)s] [%(log_color)s%(levelname)8s%(reset)s]: %(message)s (%(filename)s:%(lineno)s)",
            "log_colors": {
                "DEBUG": "cyan",
                "INFO": "green",
                "WARNING": "yellow",
                "ERROR": "red",
                "CRITICAL": "red,bg_white",
            },
        },
    },
    handlers={"h": {"class": "logging.StreamHandler", "formatter": "c", "level": logging.INFO}},
    root={
        "handlers": ["h"],
        "level": logging.INFO,
    },
)

dictConfig(logging_config)

app.command(rich_help_panel="Evaluation Backends")(lighteval.main_accelerate.accelerate)
app.command(rich_help_panel="Evaluation Utils")(lighteval.main_baseline.baseline)
9 changes: 6 additions & 3 deletions src/lighteval/data.py
@@ -20,14 +20,14 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import logging
import math
from typing import Iterator, Tuple

import torch
from torch.utils.data import Dataset
from torch.utils.data.distributed import DistributedSampler, T_co

from lighteval.logging.hierarchical_logger import hlog_warn
from lighteval.tasks.requests import (
    GreedyUntilRequest,
    LoglikelihoodRequest,
@@ -37,6 +37,9 @@
)


logger = logging.getLogger(__name__)


class DynamicBatchDataset(Dataset):
    def __init__(
        self,
@@ -76,7 +79,7 @@ def __init__(

def init_split_limits(self, num_dataset_splits):
if num_dataset_splits >= self.total_size:
hlog_warn(
logger.warning(
f"num_dataset_splits ({num_dataset_splits}) >= total_size ({self.total_size}), setting num_dataset_splits to 1"
)
num_dataset_splits = 1
@@ -247,7 +250,7 @@ def init_split_limits(self, num_dataset_splits):
_type_: _description_
"""
if num_dataset_splits is not None:
hlog_warn(
logger.warning(
"You cannot select the number of dataset splits for a generative evaluation at the moment. Automatically inferring."
)

24 changes: 13 additions & 11 deletions src/lighteval/logging/evaluation_tracker.py
@@ -22,6 +22,7 @@

import copy
import json
import logging
import os
import re
import time
@@ -37,7 +38,6 @@
from fsspec import url_to_fs
from huggingface_hub import DatasetCard, DatasetCardData, HfApi, HFSummaryWriter, hf_hub_url

from lighteval.logging.hierarchical_logger import hlog, hlog_warn
from lighteval.logging.info_loggers import (
    DetailsLogger,
    GeneralConfigLogger,
@@ -49,6 +49,8 @@
from lighteval.utils.utils import obj_to_markdown


logger = logging.getLogger(__name__)

if is_nanotron_available():
from nanotron.config import GeneralArgs # type: ignore

@@ -147,7 +149,7 @@ def __init__(

def save(self) -> None:
"""Saves the experiment information and results to files, and to the hub if requested."""
hlog("Saving experiment tracker")
logger.info("Saving experiment tracker")
date_id = datetime.now().isoformat().replace(":", "-")

# We first prepare data to save
@@ -202,15 +204,15 @@ def save_results(self, date_id: str, results_dict: dict):
output_dir_results = Path(self.output_dir) / "results" / self.general_config_logger.model_name
self.fs.mkdirs(output_dir_results, exist_ok=True)
output_results_file = output_dir_results / f"results_{date_id}.json"
hlog(f"Saving results to {output_results_file}")
logger.info(f"Saving results to {output_results_file}")
with self.fs.open(output_results_file, "w") as f:
f.write(json.dumps(results_dict, cls=EnhancedJSONEncoder, indent=2, ensure_ascii=False))

def save_details(self, date_id: str, details_datasets: dict[str, Dataset]):
output_dir_details = Path(self.output_dir) / "details" / self.general_config_logger.model_name
output_dir_details_sub_folder = output_dir_details / date_id
self.fs.mkdirs(output_dir_details_sub_folder, exist_ok=True)
hlog(f"Saving details to {output_dir_details_sub_folder}")
logger.info(f"Saving details to {output_dir_details_sub_folder}")
for task_name, dataset in details_datasets.items():
output_file_details = output_dir_details_sub_folder / f"details_{task_name}_{date_id}.parquet"
with self.fs.open(str(output_file_details), "wb") as f:
@@ -255,7 +257,7 @@ def push_to_hub(

if not self.api.repo_exists(repo_id):
self.api.create_repo(repo_id, private=not (self.public), repo_type="dataset", exist_ok=True)
hlog(f"Repository {repo_id} not found, creating it.")
logger.info(f"Repository {repo_id} not found, creating it.")

# We upload it both as a json and a parquet file
result_file_base_name = f"results_{date_id}"
@@ -490,11 +492,11 @@ def push_to_tensorboard( # noqa: C901
self, results: dict[str, dict[str, float]], details: dict[str, DetailsLogger.CompiledDetail]
):
if not is_tensorboardX_available:
hlog_warn(NO_TENSORBOARDX_WARN_MSG)
logger.warning(NO_TENSORBOARDX_WARN_MSG)
return

if not is_nanotron_available():
hlog_warn("You cannot push results to tensorboard without having nanotron installed. Skipping")
logger.warning("You cannot push results to tensorboard without having nanotron installed. Skipping")
return

prefix = self.tensorboard_metric_prefix
@@ -526,14 +528,14 @@ def push_to_tensorboard( # noqa: C901
bench_suite = None
if ":" in task_name:
bench_suite = task_name.split(":")[0] # e.g. MMLU
hlog(f"bench_suite {bench_suite} in {task_name}")
logger.info(f"bench_suite {bench_suite} in {task_name}")
for metric, value in values.items():
if "stderr" in metric:
continue
if bench_suite not in bench_averages:
bench_averages[bench_suite] = {}
bench_averages[bench_suite][metric] = bench_averages[bench_suite].get(metric, []) + [float(value)]
hlog(f"Pushing {task_name} {values} to tensorboard")
logger.info(f"Pushing {task_name} {values} to tensorboard")
for metric, value in values.items():
if "stderr" in metric:
tb_context.add_scalar(f"stderr_{prefix}/{task_name}/{metric}", value, global_step=global_step)
@@ -546,7 +548,7 @@ def push_to_tensorboard( # noqa: C901
# Tasks with subtasks
for name, values in bench_averages.items():
for metric, values in values.items():
hlog(f"Pushing average {name} {metric} {sum(values) / len(values)} to tensorboard")
logger.info(f"Pushing average {name} {metric} {sum(values) / len(values)} to tensorboard")
tb_context.add_scalar(f"{prefix}/{name}/{metric}", sum(values) / len(values), global_step=global_step)

tb_context.add_text("eval_config", obj_to_markdown(results), global_step=global_step)
Expand All @@ -571,7 +573,7 @@ def push_to_tensorboard( # noqa: C901

# Now we can push to the hub
tb_context.scheduler.trigger()
hlog(
logger.info(
f"Pushed to tensorboard at https://huggingface.co/{self.tensorboard_repo}/{output_dir_tb}/tensorboard"
f" at global_step {global_step}"
)