Skip to content

Commit

Permalink
Basic evaluate CLI command / codepath (axolotl-ai-cloud#2188)
Browse files Browse the repository at this point in the history
* basic evaluate CLI command / codepath

* tests for evaluate CLI command

* fixes and cleanup

* review comments; slightly DRYing up things

---------

Co-authored-by: Dan Saunders <[email protected]>
  • Loading branch information
djsaunde and Dan Saunders authored Dec 16, 2024
1 parent 3309048 commit f865464
Show file tree
Hide file tree
Showing 10 changed files with 494 additions and 112 deletions.
52 changes: 52 additions & 0 deletions src/axolotl/cli/evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
"""
CLI to run training on a model
"""
import logging
from pathlib import Path
from typing import Union

import fire
from dotenv import load_dotenv
from transformers.hf_argparser import HfArgumentParser

from axolotl.cli import (
check_accelerate_default_config,
check_user_token,
load_cfg,
load_datasets,
load_rl_datasets,
print_axolotl_text_art,
)
from axolotl.common.cli import TrainerCliArgs
from axolotl.evaluate import evaluate

LOG = logging.getLogger("axolotl.cli.evaluate")


def do_evaluate(cfg, cli_args) -> None:
# pylint: disable=duplicate-code
print_axolotl_text_art()
check_accelerate_default_config()
check_user_token()

if cfg.rl: # and cfg.rl != "orpo":
dataset_meta = load_rl_datasets(cfg=cfg, cli_args=cli_args)
else:
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

evaluate(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)


def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs) -> None:
# pylint: disable=duplicate-code
parsed_cfg = load_cfg(config, **kwargs)
parser = HfArgumentParser(TrainerCliArgs)
parsed_cli_args, _ = parser.parse_args_into_dataclasses(
return_remaining_strings=True
)
do_evaluate(parsed_cfg, parsed_cli_args)


if __name__ == "__main__":
load_dotenv()
fire.Fire(do_cli)
27 changes: 26 additions & 1 deletion src/axolotl/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
build_command,
fetch_from_github,
)
from axolotl.common.cli import PreprocessCliArgs, TrainerCliArgs
from axolotl.common.cli import EvaluateCliArgs, PreprocessCliArgs, TrainerCliArgs
from axolotl.utils.config.models.input.v0_4_1 import AxolotlInputConfig


Expand Down Expand Up @@ -60,6 +60,31 @@ def train(config: str, accelerate: bool, **kwargs):
do_cli(config=config, **kwargs)


@cli.command()
@click.argument("config", type=click.Path(exists=True, path_type=str))
@click.option(
"--accelerate/--no-accelerate",
default=True,
help="Use accelerate launch for multi-GPU training",
)
@add_options_from_dataclass(EvaluateCliArgs)
@add_options_from_config(AxolotlInputConfig)
def evaluate(config: str, accelerate: bool, **kwargs):
"""Evaluate a model."""
kwargs = {k: v for k, v in kwargs.items() if v is not None}

if accelerate:
base_cmd = ["accelerate", "launch", "-m", "axolotl.cli.evaluate"]
if config:
base_cmd.append(config)
cmd = build_command(base_cmd, kwargs)
subprocess.run(cmd, check=True) # nosec B603
else:
from axolotl.cli.evaluate import do_cli

do_cli(config=config, **kwargs)


@cli.command()
@click.argument("config", type=click.Path(exists=True, path_type=str))
@click.option(
Expand Down
25 changes: 19 additions & 6 deletions src/axolotl/common/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,19 @@
LOG = logging.getLogger("axolotl.common.cli")


@dataclass
class PreprocessCliArgs:
"""
dataclass representing arguments for preprocessing only
"""

debug: bool = field(default=False)
debug_text_only: bool = field(default=False)
debug_num_examples: int = field(default=1)
prompter: Optional[str] = field(default=None)
download: Optional[bool] = field(default=True)


@dataclass
class TrainerCliArgs:
"""
Expand All @@ -31,16 +44,14 @@ class TrainerCliArgs:


@dataclass
class PreprocessCliArgs:
class EvaluateCliArgs:
"""
dataclass representing arguments for preprocessing only
dataclass representing the various evaluation arguments
"""

debug: bool = field(default=False)
debug_text_only: bool = field(default=False)
debug_num_examples: int = field(default=1)
prompter: Optional[str] = field(default=None)
download: Optional[bool] = field(default=True)
debug_num_examples: int = field(default=0)


def load_model_and_tokenizer(
Expand All @@ -50,7 +61,9 @@ def load_model_and_tokenizer(
):
LOG.info(f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}")
tokenizer = load_tokenizer(cfg)

LOG.info("loading model and (optionally) peft_config...")
model, _ = load_model(cfg, tokenizer, inference=cli_args.inference)
inference = getattr(cli_args, "inference", False)
model, _ = load_model(cfg, tokenizer, inference=inference)

return model, tokenizer
168 changes: 168 additions & 0 deletions src/axolotl/evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
"""Module for evaluating models."""

import csv
import os
import sys
from pathlib import Path
from typing import Dict, Optional

import torch
from accelerate.logging import get_logger

from axolotl.common.cli import TrainerCliArgs
from axolotl.logging_config import configure_logging
from axolotl.train import TrainDatasetMeta
from axolotl.utils.dict import DictDefault
from axolotl.utils.models import load_model, load_processor, load_tokenizer
from axolotl.utils.trainer import set_pytorch_cuda_alloc_conf, setup_trainer

project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
src_dir = os.path.join(project_root, "src")
sys.path.insert(0, src_dir)

configure_logging()
LOG = get_logger("axolotl.evaluate")


def evaluate_dataset(
trainer, dataset, dataset_type: str, flash_optimum: bool = False
) -> Optional[Dict[str, float]]:
"""Helper function to evaluate a single dataset safely.
Args:
trainer: The trainer instance
dataset: Dataset to evaluate
dataset_type: Type of dataset ('train' or 'eval')
flash_optimum: Whether to use flash optimum
Returns:
Dictionary of metrics or None if dataset is None
"""
if dataset is None:
return None

LOG.info(f"Starting {dataset_type} set evaluation...")

if flash_optimum:
with torch.backends.cuda.sdp_kernel(
enable_flash=True,
enable_math=True,
enable_mem_efficient=True,
):
metrics = trainer.evaluate(dataset, metric_key_prefix=dataset_type)
else:
metrics = trainer.evaluate(dataset, metric_key_prefix=dataset_type)

LOG.info(f"{dataset_type.capitalize()} set evaluation completed!")
LOG.info(f"{dataset_type.capitalize()} Metrics:")
for key, value in metrics.items():
LOG.info(f"{key}: {value}")

return metrics


def evaluate(
*, cfg: DictDefault, cli_args: TrainerCliArgs, dataset_meta: TrainDatasetMeta
) -> Dict[str, float]:
"""
Evaluate a model on training and validation datasets
Args:
cfg: Configuration dictionary
cli_args: Command line arguments
dataset_meta: Dataset metadata containing training and evaluation datasets
Returns:
Tuple containing:
- The model (either PeftModel or PreTrainedModel)
- The tokenizer
- Dictionary of evaluation metrics
"""
# pylint: disable=duplicate-code
# Enable expandable segments for cuda allocation to improve VRAM usage
set_pytorch_cuda_alloc_conf()

# Load tokenizer
LOG.debug(
f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}",
main_process_only=True,
)
tokenizer = load_tokenizer(cfg)

# Load processor for multimodal models if needed
processor = None
if cfg.is_multimodal:
processor = load_processor(cfg, tokenizer)

# Get datasets
train_dataset = dataset_meta.train_dataset
eval_dataset = dataset_meta.eval_dataset
total_num_steps = dataset_meta.total_num_steps

# Load model
LOG.debug("loading model for evaluation...")
model, _ = load_model(
cfg, tokenizer, processor=processor, inference=cli_args.inference
)

# Set up trainer
trainer = setup_trainer(
cfg,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
model=(model, None, None), # No need for model_ref or peft_config
tokenizer=tokenizer,
processor=processor,
total_num_steps=total_num_steps,
)

# Evaluate datasets
all_metrics = {}
train_metrics = evaluate_dataset(trainer, train_dataset, "train", cfg.flash_optimum)
eval_metrics = evaluate_dataset(trainer, eval_dataset, "eval", cfg.flash_optimum)

if train_metrics:
all_metrics.update(train_metrics)
if eval_metrics:
all_metrics.update(eval_metrics)

# Save metrics to CSV if output directory is specified and we have metrics
if cfg.output_dir and (train_metrics or eval_metrics):
output_dir = Path(cfg.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)

metrics_file = output_dir / "eval_summary.csv"
with metrics_file.open("w", newline="", encoding="utf-8") as file:
writer = csv.writer(file)
writer.writerow(["metric", "training", "validation"])

# Get unique metric names (removing prefixes) from available metrics
train_metric_names = {
k.replace("train_", ""): k for k in (train_metrics or {})
}
eval_metric_names = {
k.replace("eval_", ""): k for k in (eval_metrics or {})
}
all_metric_names = sorted(
set(train_metric_names.keys()) | set(eval_metric_names.keys())
)

for metric_name in all_metric_names:
train_value = (
train_metrics.get(train_metric_names.get(metric_name, ""), "")
if train_metrics
else ""
)
eval_value = (
eval_metrics.get(eval_metric_names.get(metric_name, ""), "")
if eval_metrics
else ""
)
writer.writerow([metric_name, train_value, eval_value])

LOG.info(f"Evaluation results saved to {metrics_file}")

del model
del tokenizer

return all_metrics
19 changes: 8 additions & 11 deletions src/axolotl/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from axolotl.utils.dict import DictDefault
from axolotl.utils.freeze import freeze_layers_except
from axolotl.utils.models import load_model, load_processor, load_tokenizer
from axolotl.utils.trainer import setup_trainer
from axolotl.utils.trainer import set_pytorch_cuda_alloc_conf, setup_trainer

try:
from optimum.bettertransformer import BetterTransformer
Expand Down Expand Up @@ -53,25 +53,22 @@ class TrainDatasetMeta:
def train(
*, cfg: DictDefault, cli_args: TrainerCliArgs, dataset_meta: TrainDatasetMeta
) -> Tuple[Union[PeftModel, PreTrainedModel], PreTrainedTokenizer]:
# enable expandable segments for cuda allocation to improve VRAM usage
torch_version = torch.__version__.split(".")
torch_major, torch_minor = int(torch_version[0]), int(torch_version[1])
if torch_major == 2 and torch_minor >= 2:
if os.getenv("PYTORCH_CUDA_ALLOC_CONF") is None:
os.environ[
"PYTORCH_CUDA_ALLOC_CONF"
] = "expandable_segments:True,roundup_power2_divisions:16"

# load the tokenizer first
# Enable expandable segments for cuda allocation to improve VRAM usage
set_pytorch_cuda_alloc_conf()

# Load tokenizer
LOG.debug(
f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}",
main_process_only=True,
)
tokenizer = load_tokenizer(cfg)

# Load processor for multimodal models if needed
processor = None
if cfg.is_multimodal:
processor = load_processor(cfg, tokenizer)

# Get datasets
train_dataset = dataset_meta.train_dataset
eval_dataset = dataset_meta.eval_dataset
total_num_steps = dataset_meta.total_num_steps
Expand Down
3 changes: 3 additions & 0 deletions src/axolotl/utils/data/sft.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,9 @@ def prepare_dataset(cfg, tokenizer, processor=None):
eval_dataset = None
if cfg.dataset_exact_deduplication:
LOG.info("Deduplication not available for pretrained datasets")

return train_dataset, eval_dataset, cfg.max_steps, prompters

if eval_dataset and cfg.sample_packing and cfg.eval_sample_packing is not False:
total_eval_steps = calculate_total_num_steps(cfg, eval_dataset, update=False)
if total_eval_steps == 0:
Expand All @@ -134,6 +136,7 @@ def prepare_dataset(cfg, tokenizer, processor=None):
LOG.info(f"Maximum number of steps set at {total_num_steps}")
else:
total_num_steps = calculate_total_num_steps(cfg, train_dataset)

return train_dataset, eval_dataset, total_num_steps, prompters


Expand Down
11 changes: 11 additions & 0 deletions src/axolotl/utils/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,6 +512,17 @@ def prepare_opinionated_env(cfg):
os.environ["TOKENIZERS_PARALLELISM"] = "false"


def set_pytorch_cuda_alloc_conf():
"""Set up CUDA allocation config if using PyTorch >= 2.2"""
torch_version = torch.__version__.split(".")
torch_major, torch_minor = int(torch_version[0]), int(torch_version[1])
if torch_major == 2 and torch_minor >= 2:
if os.getenv("PYTORCH_CUDA_ALLOC_CONF") is None:
os.environ[
"PYTORCH_CUDA_ALLOC_CONF"
] = "expandable_segments:True,roundup_power2_divisions:16"


def setup_trainer(
cfg, train_dataset, eval_dataset, model, tokenizer, processor, total_num_steps
):
Expand Down
Loading

0 comments on commit f865464

Please sign in to comment.