[BAAI] refine resnet50-tf2, bugfix and add performance metrics #388

Merged: 6 commits, Jan 12, 2024
4 changes: 4 additions & 0 deletions training/benchmarks/driver/base.py
@@ -56,6 +56,10 @@ def setup_config(self, parser):
if known_args.extern_module_dir:
mod_util.install_extern_modules(known_args.extern_module_dir,
self.extern_modules)

if not hasattr(self.config, "local_rank"):
self.config.local_rank = 0

self.logger = perf_logger.PerfLogger.get_default_logger(
rank=self.config.local_rank)

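The new guard matters because the TF2 path can reach `PerfLogger` without any launcher having set `local_rank` on the config. A minimal sketch of the pattern, using a hypothetical `Config` stand-in for the real config object:

```python
# Hypothetical stand-in for the benchmark's config object.
class Config:
    pass

config = Config()

# Without the guard, config.local_rank would raise AttributeError in
# single-process runs where no launcher injected a rank.
if not hasattr(config, "local_rank"):
    config.local_rank = 0  # treat the process as rank 0

print(config.local_rank)  # -> 0
```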
2 changes: 1 addition & 1 deletion training/benchmarks/driver/config_manager.py
@@ -157,7 +157,7 @@ def activate(base_config,
enable_extern_config)

# TODO: consider switching to a more elegant approach later
if "tensorflow2" in base_config.__path__:
if hasattr(base_config, "__path__") and "tensorflow2" in base_config.__path__:
base_config.override(parsed_params.__dict__, False)
else:
_merge_dict_to_config(parsed_params.__dict__, base_config.__dict__)
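The added `hasattr` check matters because only module-style configs expose `__path__`; plain configs fall through to the dict merge. A sketch under the assumption that `__path__` is a path string on the TF2 config object (both classes below are hypothetical stand-ins):

```python
# Hypothetical stand-ins for the two config styles handled above.
class Tf2StyleConfig:
    """Module-style config: has __path__ and an override() method."""
    __path__ = "training/benchmarks/resnet50/tensorflow2/configs"

    def override(self, params, is_strict):
        self.__dict__.update(params)

class PlainConfig:
    """Plain config: no __path__; merged via its __dict__."""

def activate(base_config, parsed_params):
    if hasattr(base_config, "__path__") and "tensorflow2" in base_config.__path__:
        base_config.override(parsed_params, False)
    else:
        base_config.__dict__.update(parsed_params)  # _merge_dict_to_config

activate(PlainConfig(), {"lr": 0.1})  # previously raised AttributeError
```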
28 changes: 22 additions & 6 deletions training/benchmarks/resnet50/tensorflow2/configs/configs.py
@@ -18,6 +18,7 @@
from core import dataset_factory
from . import base_configs
from resnet import resnet_config
from absl import flags


@dataclasses.dataclass
@@ -36,11 +37,11 @@ class ResNetImagenetConfig(base_configs.ExperimentConfig):
mean_subtract=True,
standardize=True)
train: base_configs.TrainConfig = base_configs.TrainConfig(
resume_checkpoint=True,
resume_checkpoint=False,
epochs=90,
steps=None,
callbacks=base_configs.CallbacksConfig(
enable_checkpoint_and_export=True, enable_tensorboard=True),
enable_checkpoint_and_export=False, enable_tensorboard=True),
metrics=['accuracy', 'top_5'],
time_history=base_configs.TimeHistoryConfig(log_steps=100),
tensorboard=base_configs.TensorBoardConfig(track_lr=True,
@@ -49,14 +50,29 @@ class ResNetImagenetConfig(base_configs.ExperimentConfig):
evaluation: base_configs.EvalConfig = base_configs.EvalConfig(
epochs_between_evals=1, steps=None)
model: base_configs.ModelConfig = resnet_config.ResNetModelConfig()
do_train: bool = True
target_accuracy: float = 0.76


def get_config(model: str, dataset: str) -> base_configs.ExperimentConfig:
def get_config(flags_obj: flags.FlagValues, model: str, dataset: str) -> base_configs.ExperimentConfig:
"""Given model and dataset names, return the ExperimentConfig."""
resnet_config = ResNetImagenetConfig(
    mode="train_and_eval",
    model_dir="result",
    train_dataset=dataset_factory.ImageNetConfig(
        split='train',
        data_dir=flags_obj.data_dir,
        one_hot=False,
        mean_subtract=True,
        standardize=True),
    validation_dataset=dataset_factory.ImageNetConfig(
        split='validation',
        data_dir=flags_obj.data_dir,
        one_hot=False,
        mean_subtract=True,
        standardize=True),
)
dataset_model_config_map = {
'imagenet': {
'resnet': ResNetImagenetConfig(),
}
'imagenet': { 'resnet': resnet_config }
}
try:
return dataset_model_config_map[dataset][model]
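A hedged usage sketch of the new signature; it assumes this module is importable as `configs.configs` and that a `data_dir` flag exists (in the real harness these come from `hyperparams_flags` and `run_pretraining.py`):

```python
from absl import app, flags

from configs import configs  # as imported in run_pretraining.py (assumed path)

# Assumed flag for the sketch; the harness defines its own data_dir flag.
flags.DEFINE_string('data_dir', default=None,
                    help='Path to the ImageNet TFRecords.')

def main(_):
    # flags_obj.data_dir now reaches both ImageNetConfig instances above.
    params = configs.get_config(flags.FLAGS, model='resnet', dataset='imagenet')
    print(params.train_dataset.data_dir)

if __name__ == '__main__':
    app.run(main)
```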
@@ -16,7 +16,7 @@ def get_models() -> Mapping[str, tf.keras.Model]:

def convert_model(params):
model_params = params.model.model_params.as_dict()
model = get_models()[params.model.name](**model_params)
model = get_models()[params.model.name.lower()](**model_params)
return model


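The `.lower()` makes the registry lookup case-insensitive: `get_models()` keys are lowercase, while a config may carry a name like `'ResNet'`. A tiny sketch of the failure mode being fixed:

```python
# get_models() returns a lowercase-keyed registry, e.g. {'resnet': <builder>}.
registry = {'resnet': object()}

name = 'ResNet'                 # spelling that may arrive from a config
# registry[name]                # before the fix: KeyError: 'ResNet'
model = registry[name.lower()]  # after the fix: resolves correctly
```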
1 change: 1 addition & 0 deletions training/benchmarks/resnet50/tensorflow2/resnet/common.py
@@ -192,6 +192,7 @@ def build_stats(history, eval_output, callbacks):
# timestamp_log = callback.timestamp_log
# stats['step_timestamp_log'] = timestamp_log
stats['train_finish_time'] = callback.train_finish_time
stats['num_trained_samples'] = callback.num_trained_samples
if callback.epoch_runtime_log:
stats[
'avg_exp_per_second'] = callback.average_examples_per_second
44 changes: 28 additions & 16 deletions training/benchmarks/resnet50/tensorflow2/run_pretraining.py
@@ -38,7 +38,6 @@
from configs import base_configs
from configs import configs
from resnet import common
from resnet import resnet_model
from modeling import hyperparams
from modeling import performance
from core import trainer_adapter
@@ -47,7 +46,6 @@

logger = None


def get_dtype_map() -> Mapping[str, tf.dtypes.DType]:
"""Returns the mapping from dtype string representations to TF dtypes."""
return {
@@ -140,24 +138,23 @@ def check_must_envconfigs(params):
", ".join(must_configs))
params.local_rank = int(os.environ['FLAGPERF_NODE_RANK'])
params.runtime.num_gpus = int(os.environ["FLAGPERF_NPROC"])

if params.runtime.distribution_strategy == 'multi_worker_mirrored':
hosts = os.environ["FLAGPERF_HOSTS"].split(",")
ports = os.environ["FLAGPERF_HOSTS_PORTS"].split(",")
params.runtime.worker_hosts = ",".join(
[hosts[i] + ":" + ports[i] for i in range(len(hosts))])
[hosts[i] + ":" + ports[0] for i in range(len(hosts))])
params.runtime.task_index = int(os.environ['FLAGPERF_NODE_RANK'])

return params
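A worked example of the environment contract this function enforces; the values are illustrative, while the variable names come from the checks above. Note the `ports[0]` change: every worker now reuses the first configured port, so a single shared port suffices:

```python
import os

# Illustrative two-node setup; names match the FLAGPERF_* checks above.
os.environ['FLAGPERF_NODE_RANK'] = '0'
os.environ['FLAGPERF_NPROC'] = '8'
os.environ['FLAGPERF_HOSTS'] = '10.0.0.1,10.0.0.2'
os.environ['FLAGPERF_HOSTS_PORTS'] = '29500'

hosts = os.environ['FLAGPERF_HOSTS'].split(',')
ports = os.environ['FLAGPERF_HOSTS_PORTS'].split(',')
worker_hosts = ','.join(h + ':' + ports[0] for h in hosts)
print(worker_hosts)  # -> 10.0.0.1:29500,10.0.0.2:29500
```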


def _get_params_from_flags(flags_obj: flags.FlagValues):
"""Get ParamsDict from flags."""

global logger
pp = pprint.PrettyPrinter()

params = configs.get_config(model='resnet', dataset='imagenet')
logging.info('Base params: %s', pp.pformat(params.as_dict()))
params = configs.get_config(flags_obj, model='resnet', dataset='imagenet')
logging.info('_get_params_from_flags Base params: %s', pp.pformat(params.as_dict()))

driver = Driver(params, [])
driver.setup_config(argparse.ArgumentParser("resnet50"))
@@ -230,7 +227,6 @@ def initialize(params: base_configs.ExperimentConfig,
if params.runtime.batchnorm_spatial_persistent:
os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'


def define_classifier_flags():
"""Defines common flags for image classification."""
hyperparams_flags.initialize_common_flags()
@@ -267,7 +263,9 @@ def define_classifier_flags():
flags.DEFINE_string('extern_module_dir',
default=None,
help='The extern module dir.')

flags.DEFINE_string('vendor',
default=None,
help='AI chip vendor')

def serialize_config(params: base_configs.ExperimentConfig, model_dir: str):
"""Serializes and saves the experiment config."""
@@ -341,7 +339,8 @@ def train_and_eval(params: base_configs.ExperimentConfig,
steps_per_execution=steps_per_loop)

initial_epoch = 0
if params.train.resume_checkpoint:
print("params.train.resume_checkpoint", params.train.resume_checkpoint)
if hasattr(params.train, "resume_checkpoint") and params.train.resume_checkpoint:
initial_epoch = resume_from_checkpoint(
model=model,
model_dir=params.model_ckpt_dir,
@@ -374,19 +373,24 @@
driver.event(Event.INIT_END)
init_end_time = logger.previous_log_time
params.init_time = (init_end_time - init_start_time) / 1e+3

raw_train_start_time = logger.previous_log_time

# train
stats_training = dict()
raw_train_start_time = time.time()
driver.event(Event.TRAIN_START)
# model.fit() loads the entire dataset into memory at once; suitable when the dataset is small enough not to cause an out-of-memory error.
history = model.fit(train_dataset,
epochs=train_epochs,
steps_per_epoch=train_steps,
initial_epoch=initial_epoch,
callbacks=callbacks,
verbose=2,
**validation_kwargs)
stats_training['no_eval_time'] = time.time() - raw_train_start_time

driver.event(Event.TRAIN_END)

# evaluate
validation_output = None
if not params.evaluation.skip_eval:
validation_output = model.evaluate(validation_dataset,
Expand All @@ -395,8 +399,11 @@ def train_and_eval(params: base_configs.ExperimentConfig,

# TODO(dankondratyuk): eval and save final test accuracy
stats = common.build_stats(history, validation_output, callbacks)
raw_train_end_time = logger.previous_log_time
params.raw_train_time = (raw_train_end_time - raw_train_start_time) / 1e+3
params.raw_train_time = time.time() - raw_train_start_time
stats['no_eval_time'] = stats_training['no_eval_time']
stats['raw_train_time'] = params.raw_train_time
stats['pure_compute_time'] = stats_training['no_eval_time']

return stats, params
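
The timing fields are nested spans of the same wall clock: `no_eval_time` covers `model.fit()` only, `raw_train_time` extends through the final `model.evaluate()`, and `pure_compute_time` is aliased to `no_eval_time` in this version since data I/O is not timed separately. A sketch of the relationships:

```python
import time

# Placeholder sketch of the span structure used in train_and_eval().
raw_train_start_time = time.time()
# ... model.fit(...) ...
no_eval_time = time.time() - raw_train_start_time    # fit only
# ... model.evaluate(...) ...
raw_train_time = time.time() - raw_train_start_time  # fit + final eval
pure_compute_time = no_eval_time  # no separate data-I/O timing here

assert no_eval_time <= raw_train_time
```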


@@ -427,7 +434,6 @@ def run(flags_obj: flags.FlagValues,
"""

params, driver = _get_params_from_flags(flags_obj)

if params.mode == 'train_and_eval':
return train_and_eval(params, strategy_override, driver)
elif params.mode == 'export_only':
@@ -446,7 +452,13 @@ def main(_):
if params.do_train:
finished_info = {
"e2e_time": e2e_time,
"training_sequences_per_second": stats['avg_exp_per_second'],
"num_trained_samples": stats['num_trained_samples'],
"train_time": stats['raw_train_time'],
"train_no_eval_time": stats['no_eval_time'],
"pure_training_computing_time": stats['pure_compute_time'],
"throughput(ips)_raw": stats['num_trained_samples'] / stats['raw_train_time'],
"throughput(ips)_no_eval": stats['num_trained_samples'] / stats['no_eval_time'],
"throughput(ips)_pure_compute": stats['num_trained_samples'] / stats['pure_compute_time'],
"converged": stats["converged"],
"final_accuracy": stats["accuracy_top_1"],
"final_loss": stats["eval_loss"],
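The three throughput entries share a numerator (`num_trained_samples` from the TimeHistory callback) and differ only in the denominator; they appear to correspond to the p_whole/p_train/p_core columns in the README below. A worked sketch with illustrative numbers:

```python
# Illustrative values; in main() these come from the stats dict.
num_trained_samples = 1_281_167 * 3        # e.g. three epochs of ImageNet
raw_train_time = 760.0                     # seconds, fit + final evaluation
no_eval_time = 742.0                       # seconds, fit only
pure_compute_time = no_eval_time           # aliased in this version

ips_raw = num_trained_samples / raw_train_time             # ~5057 ips
ips_no_eval = num_trained_samples / no_eval_time           # ~5180 ips
ips_pure_compute = num_trained_samples / pure_compute_time # equals ips_no_eval
```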
@@ -86,6 +86,11 @@ def global_steps(self):
"""The current 1-indexed global step."""
return self.steps_before_epoch + self.steps_in_epoch

@property
def num_trained_samples(self):
"""number of trained samples"""
return self.global_steps * self.batch_size

@property
def average_steps_per_second(self):
"""The average training steps per second across all epochs."""
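The arithmetic behind the new property is simply steps times batch size; whether that batch size is per-replica or global depends on how the callback was constructed, so treat this as a sketch:

```python
# Illustrative: 100 global steps at 1024 images per step.
global_steps = 100
batch_size = 1024
num_trained_samples = global_steps * batch_size
print(num_trained_samples)  # -> 102400
```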
50 changes: 50 additions & 0 deletions training/nvidia/resnet50-tensorflow2/README.md
@@ -0,0 +1,50 @@
### 1. Dataset preparation
[Dataset preparation](../../benchmarks/resnet50/tensorflow2/README.md#2数据集准备)

### 2. Nvidia GPU configuration and run information reference
#### Environment configuration
- ##### Hardware environment
  - Machine model: NVIDIA DGX A100 (40G)
  - Accelerator model: NVIDIA_A100-SXM4-40GB
  - CPU model: AMD EPYC7742-64core@2.25G
  - Multi-node network type and bandwidth: InfiniBand, 200Gb/s

- ##### Software environment
  - OS version: Ubuntu 20.04
  - OS kernel version: 5.4.0-113-generic
  - Accelerator driver version: 470.129.06
  - Docker version: 20.10.16
  - Training framework version: tensorflow 2.6.0+nv
  - Dependency versions:
    - cuda: 11.4


### Run results

* General metrics

| Metric name | Metric value | Notes |
| -------------- | ----------------------- | ----------------------------------------------- |
| Task category | Image classification | |
| Model | resnet50 | |
| Dataset | ImageNet2012 | |
| Data precision | precision, see "Performance metrics" | one of fp32/amp/fp16 |
| Hyperparameter changes | fix_hp, see "Performance metrics" | special hyperparameters required to saturate the hardware when measuring throughput |
| Hardware device | nvidia A100 | |
| Hardware memory usage | mem, see "Performance metrics" | commonly called "device memory", in GiB |
| End-to-end time | e2e_time, see "Performance metrics" | total time, including Perf initialization etc. |
| Overall throughput | p_whole, see "Performance metrics" | images actually trained divided by total time (performance_whole) |
| Training throughput | p_train, see "Performance metrics" | excludes the evaluation time at the end of each epoch |
| **Compute throughput** | **p_core, see "Performance metrics"** | excludes data-I/O time (p_core >= p_train > p_whole) |
| Training result | acc1, see "Performance metrics" | top-1 classification accuracy (acc1) |
| Additional modifications | none | |

* Performance metrics

| Config | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc1 | mem |
| ------------------- | --------- | ------ | -------- | ------- | ------- | ------ | ----- | --------- |
| A100 single node, 1 GPU (1x1) | fp32 | / | / | 746.3 | 748.1 | 748.1 | / | 39.3/40.0 |
| A100 single node, 8 GPUs (1x8) | fp32 | / | 18578 | 5175.1 | 5176.7 | 5176.7 | 75.5% | 39.5/40.0 |
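
One way to read the table is scaling efficiency on p_whole; a quick check with the numbers above:

```python
# Scaling efficiency of 1x8 over 1x1, computed from the p_whole column.
p_whole_1x1 = 746.3
p_whole_1x8 = 5175.1
efficiency = p_whole_1x8 / (8 * p_whole_1x1)
print(f"{efficiency:.1%}")  # -> 86.7%
```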



59 changes: 59 additions & 0 deletions training/nvidia/resnet50-tensorflow2/config/config_A100x1x1.py
@@ -0,0 +1,59 @@
# Training configuration for ResNet trained on ImageNet on GPUs.
# Reaches > 76.1% within 90 epochs.
# Note = This configuration uses a scaled per-replica batch size based on the number of devices.
# Base params = base_configs.ExperimentConfig
do_train = True
model_dir = 'result'
model_ckpt_dir = ' '
mode = 'train_and_eval'
target_accuracy: float = 0.76
# runtime = dict(
# distribution_strategy = 'multi_worker_mirrored',
# run_eagerly = None,
# tpu = None,
# batchnorm_spatial_persistent = True)
runtime = dict(distribution_strategy='mirrored',
run_eagerly=None,
tpu=None,
batchnorm_spatial_persistent=True)
train_dataset = dict(name='imagenet2012',
data_dir='/raid/dataset/ImageNet2012/tf_records',
builder='records',
split='train',
image_size=224,
num_classes=1000,
num_examples=1281167,
batch_size=128,
use_per_replica_batch_size=True,
dtype='float16',
mean_subtract=True,
standardize=True)
validation_dataset = dict(
name='imagenet2012',
data_dir='/raid/dataset/ImageNet2012/tf_records',
builder='records',
split='validation',
image_size=224,
num_classes=1000,
num_examples=50000,
batch_size=128, #256
use_per_replica_batch_size=True,
dtype='float16',
mean_subtract=True,
standardize=True)
model = dict(name='resnet',
model_params=dict(rescale_inputs=False),
optimizer=dict(name='momentum',
momentum=0.9,
decay=0.9,
epsilon=0.001),
loss=dict(label_smoothing=0.1))
train = dict(resume_checkpoint=True,
epochs=90,
time_history=dict(log_steps=100),
callbacks=dict(enable_checkpoint_and_export=False,
enable_backup_and_restore=False))
evaluation = dict(epochs_between_evals=1)

# local_rank for distributed training on gpus
local_rank: int = -1 ## for log
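
Because `use_per_replica_batch_size=True`, the header's "scaled per-replica batch size" means the global batch grows with the replica count. A sketch of the resulting arithmetic, using values from this config:

```python
# With use_per_replica_batch_size=True the configured batch is per replica.
per_replica_batch_size = 128
num_replicas = 1                 # 8 for the A100 1x8 config
global_batch_size = per_replica_batch_size * num_replicas
steps_per_epoch = 1281167 // global_batch_size  # num_examples above
print(global_batch_size, steps_per_epoch)       # -> 128 10009
```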
@@ -6,7 +6,7 @@
model_dir = 'result'
model_ckpt_dir = ' '
mode = 'train_and_eval'
target_accuracy: float = 0.75
target_accuracy: float = 0.76
# runtime = dict(
# distribution_strategy = 'multi_worker_mirrored',
# run_eagerly = None,
@@ -17,7 +17,7 @@
tpu=None,
batchnorm_spatial_persistent=True)
train_dataset = dict(name='imagenet2012',
data_dir='/mnt/data/ImageNet2012/tf_records',
data_dir='/raid/dataset/ImageNet2012/tf_records',
builder='records',
split='train',
image_size=224,
@@ -30,7 +30,7 @@
standardize=True)
validation_dataset = dict(
name='imagenet2012',
data_dir='/mnt/data/ImageNet2012/tf_records',
data_dir='/raid/dataset/ImageNet2012/tf_records',
builder='records',
split='validation',
image_size=224,
@@ -52,7 +52,7 @@
epochs=90,
time_history=dict(log_steps=100),
callbacks=dict(enable_checkpoint_and_export=False,
enable_backup_and_restore=True))
enable_backup_and_restore=False))
evaluation = dict(epochs_between_evals=1)

# local_rank for distributed training on gpus
1 change: 1 addition & 0 deletions training/run_benchmarks/config/test_conf.py
@@ -76,6 +76,7 @@

# "wav2vec2:pytorch_1.13:A100:1:8:1": "/raid/dataset/wav2vec2_data/LibriSpeech",
# "WaveGlow:pytorch_1.13:A100:1:8:1": "/raid/dataset/LJSpeech/",
"resnet50:tensorflow2:A100:1:8:1": "/raid/dataset/ImageNet2012/tf_records/",

# "distilbert:pytorch_1.12:A100:1:8:1": "/raid/dataset/distilbert/",

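For context, each key in this mapping appears to encode a case as `model:framework:hardware:nnodes:nprocs:repeats`, with the value giving the host path of the dataset; a hedged parse of the new entry:

```python
# Hedged reading of the key format, inferred from the surrounding entries.
case = "resnet50:tensorflow2:A100:1:8:1"
model, framework, hardware, nnodes, nprocs, repeats = case.split(":")
data_dir = "/raid/dataset/ImageNet2012/tf_records/"
print(model, framework, hardware, nnodes, nprocs, repeats)
# -> resnet50 tensorflow2 A100 1 8 1
```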