Skip to content

Commit

Permalink
refine resnet50-tf2, bugfix and add performance metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
zhouyu committed Jan 9, 2024
1 parent 9f821a5 commit 2d6554c
Show file tree
Hide file tree
Showing 12 changed files with 76 additions and 29 deletions.
5 changes: 5 additions & 0 deletions training/benchmarks/driver/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def setup_config(self, parser):
required=True,
help="The accelerator vendor that run the located.")
known_args, unknown_args = parser.parse_known_args()

config_manager.activate(self.config, self.mutable_params,
known_args.extern_config_dir,
known_args.extern_config_file,
Expand All @@ -56,6 +57,10 @@ def setup_config(self, parser):
if known_args.extern_module_dir:
mod_util.install_extern_modules(known_args.extern_module_dir,
self.extern_modules)

if not hasattr(self.config, "local_rank"):
self.config.local_rank = 0

self.logger = perf_logger.PerfLogger.get_default_logger(
rank=self.config.local_rank)

Expand Down
28 changes: 22 additions & 6 deletions training/benchmarks/resnet50/tensorflow2/configs/configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from core import dataset_factory
from . import base_configs
from resnet import resnet_config
from absl import flags


@dataclasses.dataclass
Expand All @@ -36,11 +37,11 @@ class ResNetImagenetConfig(base_configs.ExperimentConfig):
mean_subtract=True,
standardize=True)
train: base_configs.TrainConfig = base_configs.TrainConfig(
resume_checkpoint=True,
resume_checkpoint=False,
epochs=90,
steps=None,
callbacks=base_configs.CallbacksConfig(
enable_checkpoint_and_export=True, enable_tensorboard=True),
enable_checkpoint_and_export=False, enable_tensorboard=True),
metrics=['accuracy', 'top_5'],
time_history=base_configs.TimeHistoryConfig(log_steps=100),
tensorboard=base_configs.TensorBoardConfig(track_lr=True,
Expand All @@ -49,14 +50,29 @@ class ResNetImagenetConfig(base_configs.ExperimentConfig):
evaluation: base_configs.EvalConfig = base_configs.EvalConfig(
epochs_between_evals=1, steps=None)
model: base_configs.ModelConfig = resnet_config.ResNetModelConfig()
do_train: str = True
target_accuracy: float = 0.1


def get_config(model: str, dataset: str) -> base_configs.ExperimentConfig:
def get_config(flags_obj: flags.FlagValues, model: str, dataset: str) -> base_configs.ExperimentConfig:
"""Given model and dataset names, return the ExperimentConfig."""
resnet_config = ResNetImagenetConfig( mode="train_and_eval",
model_dir="result",
train_dataset=dataset_factory.ImageNetConfig(
split='train',
data_dir = flags_obj.data_dir,
one_hot=False,
mean_subtract=True,
standardize=True),
validation_dataset=dataset_factory.ImageNetConfig(
split='validation',
data_dir = flags_obj.data_dir,
one_hot=False,
mean_subtract=True,
standardize=True),
)
dataset_model_config_map = {
'imagenet': {
'resnet': ResNetImagenetConfig(),
}
'imagenet': { 'resnet': resnet_config }
}
try:
return dataset_model_config_map[dataset][model]
Expand Down
1 change: 1 addition & 0 deletions training/benchmarks/resnet50/tensorflow2/core/callbacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ def __init__(self, target_accuracy):
self.converged = False

def on_batch_end(self, batch, logs=None):
    """Stop training as soon as the target accuracy is reached.

    Args:
        batch: Index of the batch that just finished (unused).
        logs: Metric dict supplied by Keras at the end of the batch, e.g.
            {'loss': 12.37, 'accuracy': 0.00097, 'top_5_accuracy': 0.0058}.
            May be None depending on the caller, hence the guard below.
    """
    # Guard against a None/missing logs dict instead of raising
    # TypeError/KeyError in the middle of training.
    accuracy = (logs or {}).get("accuracy")
    if accuracy is not None and accuracy >= self.target_accuracy:
        self.converged = True
        # Tells Keras to break out of model.fit() after this batch.
        self.model.stop_training = True
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ class RuntimeConfig(base_config.Config):
batchnorm_spatial_persistent: Whether or not to enable the spatial
persistent mode for CuDNN batch norm kernel for improved GPU performance.
"""
distribution_strategy: str = "mirrored"
distribution_strategy: str = "multi_worker_mirrored"
enable_xla: bool = False
gpu_thread_mode: Optional[str] = None
dataset_num_private_threads: Optional[int] = None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from __future__ import print_function
import dataclasses
import os
import pprint
from typing import Any, List, Mapping, Optional, Tuple, Union

from absl import logging
Expand Down Expand Up @@ -370,7 +371,8 @@ def load_records(self) -> tf.data.Dataset:

file_pattern = os.path.join(self.config.data_dir,
self.config.split,
'{}*'.format(self.config.split))
'{}*'.format(self.config.split)
)
dataset = tf.data.Dataset.list_files(file_pattern, shuffle=False)
else:
dataset = tf.data.Dataset.from_tensor_slices(self.config.filenames)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def get_models() -> Mapping[str, tf.keras.Model]:

def convert_model(params):
    """Build the Keras model selected by ``params.model``.

    Looks up the (lower-cased) model name in the registry returned by
    ``get_models()`` and instantiates it with the configured keyword
    arguments from ``params.model.model_params``.
    """
    model_name = params.model.name.lower()
    model_kwargs = params.model.model_params.as_dict()
    builder = get_models()[model_name]
    return builder(**model_kwargs)


Expand Down
1 change: 1 addition & 0 deletions training/benchmarks/resnet50/tensorflow2/resnet/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ def build_stats(history, eval_output, callbacks):
# timestamp_log = callback.timestamp_log
# stats['step_timestamp_log'] = timestamp_log
stats['train_finish_time'] = callback.train_finish_time
stats['num_trained_samples'] = callback.num_trained_samples
if callback.epoch_runtime_log:
stats[
'avg_exp_per_second'] = callback.average_examples_per_second
Expand Down
45 changes: 30 additions & 15 deletions training/benchmarks/resnet50/tensorflow2/run_pretraining.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@
from configs import base_configs
from configs import configs
from resnet import common
from resnet import resnet_model
from modeling import hyperparams
from modeling import performance
from core import trainer_adapter
Expand All @@ -47,7 +46,6 @@

logger = None


def get_dtype_map() -> Mapping[str, tf.dtypes.DType]:
"""Returns the mapping from dtype string representations to TF dtypes."""
return {
Expand Down Expand Up @@ -140,24 +138,24 @@ def check_must_envconfigs(params):
", ".join(must_configs))
params.local_rank = int(os.environ['FLAGPERF_NODE_RANK'])
params.runtime.num_gpus = int(os.environ["FLAGPERF_NPROC"])

if params.runtime.distribution_strategy == 'multi_worker_mirrored':
hosts = os.environ["FLAGPERF_HOSTS"].split(",")
ports = os.environ["FLAGPERF_HOSTS_PORTS"].split(",")
params.runtime.worker_hosts = ",".join(
[hosts[i] + ":" + ports[i] for i in range(len(hosts))])
[hosts[i] + ":" + ports[0] for i in range(len(hosts))])
params.runtime.task_index = int(os.environ['FLAGPERF_NODE_RANK'])

return params


# TODO: config.data_dir is empty!
def _get_params_from_flags(flags_obj: flags.FlagValues):
"""Get ParamsDict from flags."""

global logger
pp = pprint.PrettyPrinter()

params = configs.get_config(model='resnet', dataset='imagenet')
logging.info('Base params: %s', pp.pformat(params.as_dict()))
params = configs.get_config(flags_obj, model='resnet', dataset='imagenet')
logging.info('_get_params_from_flags Base params: %s', pp.pformat(params.as_dict()))

driver = Driver(params, [])
driver.setup_config(argparse.ArgumentParser("renset50"))
Expand Down Expand Up @@ -230,7 +228,6 @@ def initialize(params: base_configs.ExperimentConfig,
if params.runtime.batchnorm_spatial_persistent:
os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'


def define_classifier_flags():
"""Defines common flags for image classification."""
hyperparams_flags.initialize_common_flags()
Expand Down Expand Up @@ -267,6 +264,10 @@ def define_classifier_flags():
flags.DEFINE_string('extern_module_dir',
default=None,
help='The extern module dir.')
flags.DEFINE_string('vendor',
default=None,
help='AI chip vendor')
print("flags", flags)


def serialize_config(params: base_configs.ExperimentConfig, model_dir: str):
Expand Down Expand Up @@ -341,7 +342,8 @@ def train_and_eval(params: base_configs.ExperimentConfig,
steps_per_execution=steps_per_loop)

initial_epoch = 0
if params.train.resume_checkpoint:
print("params.train.resume_checkpoint", params.train.resume_checkpoint)
if hasattr(params.train, "resume_checkpoint") and params.train.resume_checkpoint:
initial_epoch = resume_from_checkpoint(
model=model,
model_dir=params.model_ckpt_dir,
Expand Down Expand Up @@ -374,19 +376,24 @@ def train_and_eval(params: base_configs.ExperimentConfig,
driver.event(Event.INIT_END)
init_end_time = logger.previous_log_time
params.init_time = (init_end_time - init_start_time) / 1e+3

raw_train_start_time = logger.previous_log_time

# train
stats_training = dict()
raw_train_start_time = time.time()
driver.event(Event.TRAIN_START)
# model.fit() loads the whole dataset into memory at once; suitable when the dataset is small enough not to cause an out-of-memory error.
history = model.fit(train_dataset,
epochs=train_epochs,
steps_per_epoch=train_steps,
initial_epoch=initial_epoch,
callbacks=callbacks,
verbose=2,
**validation_kwargs)
stats_training['no_eval_time'] = time.time() - raw_train_start_time

driver.event(Event.TRAIN_END)

# evaluate
validation_output = None
if not params.evaluation.skip_eval:
validation_output = model.evaluate(validation_dataset,
Expand All @@ -395,8 +402,11 @@ def train_and_eval(params: base_configs.ExperimentConfig,

# TODO(dankondratyuk): eval and save final test accuracy
stats = common.build_stats(history, validation_output, callbacks)
raw_train_end_time = logger.previous_log_time
params.raw_train_time = (raw_train_end_time - raw_train_start_time) / 1e+3
params.raw_train_time = time.time() - raw_train_start_time
stats['no_eval_time'] = stats_training['no_eval_time']
stats['raw_train_time'] = params.raw_train_time
stats['pure_compute_time'] = stats_training['no_eval_time']

return stats, params


Expand Down Expand Up @@ -427,7 +437,6 @@ def run(flags_obj: flags.FlagValues,
"""

params, driver = _get_params_from_flags(flags_obj)

if params.mode == 'train_and_eval':
return train_and_eval(params, strategy_override, driver)
elif params.mode == 'export_only':
Expand All @@ -446,7 +455,13 @@ def main(_):
if params.do_train:
finished_info = {
"e2e_time": e2e_time,
"training_sequences_per_second": stats['avg_exp_per_second'],
"num_trained_samples": stats['num_trained_samples'],
"train_time": stats['raw_train_time'],
"train_no_eval_time": stats['no_eval_time'],
"pure_training_computing_time": stats['pure_compute_time'],
"throughput(ips)_raw": stats['num_trained_samples'] / stats['raw_train_time'],
"throughput(ips)_no_eval": stats['num_trained_samples'] / stats['no_eval_time'],
"throughput(ips)_pure_compute": stats['num_trained_samples'] / stats['pure_compute_time'],
"converged": stats["converged"],
"final_accuracy": stats["accuracy_top_1"],
"final_loss": stats["eval_loss"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,11 @@ def global_steps(self):
"""The current 1-indexed global step."""
return self.steps_before_epoch + self.steps_in_epoch

@property
def num_trained_samples(self):
    """Total count of training samples processed so far.

    Equal to the number of completed global steps multiplied by the
    per-step batch size.
    """
    return self.batch_size * self.global_steps

@property
def average_steps_per_second(self):
"""The average training steps per second across all epochs."""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
tpu=None,
batchnorm_spatial_persistent=True)
train_dataset = dict(name='imagenet2012',
data_dir='/mnt/data/ImageNet2012/tf_records',
data_dir='/raid/dataset/ImageNet2012/tf_records',
builder='records',
split='train',
image_size=224,
Expand All @@ -30,7 +30,7 @@
standardize=True)
validation_dataset = dict(
name='imagenet2012',
data_dir='/mnt/data/ImageNet2012/tf_records',
data_dir='/raid/dataset/ImageNet2012/tf_records',
builder='records',
split='validation',
image_size=224,
Expand All @@ -52,7 +52,7 @@
epochs=90,
time_history=dict(log_steps=100),
callbacks=dict(enable_checkpoint_and_export=False,
enable_backup_and_restore=True))
enable_backup_and_restore=False))
evaluation = dict(epochs_between_evals=1)

# local_rank for distributed training on gpus
Expand Down
1 change: 1 addition & 0 deletions training/run_benchmarks/config/test_conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@

# "wav2vec2:pytorch_1.13:A100:1:8:1": "/raid/dataset/wav2vec2_data/LibriSpeech",
# "WaveGlow:pytorch_1.13:A100:1:8:1": "/raid/dataset/LJSpeech/",
"resnet50:tensorflow2:A100:1:8:1": "/raid/dataset/ImageNet2012/tf_records/",

# "distilbert:pytorch_1.12:A100:1:8:1": "/raid/dataset/distilbert/",

Expand Down
5 changes: 3 additions & 2 deletions training/run_benchmarks/tensorflow2/start_tensorflow2_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,11 +145,12 @@ def _get_basic_train_script_args(task_args):
extern_module_dir = helper.get_extern_module_dir(task_args)
basic_train_script_args = " --data_dir " + task_args.data_dir \
+ " --extern_config_dir " + config_dir \
+ " --extern_config_file " + config_file
+ " --extern_config_file " + config_file \
+ " --vendor " + task_args.vendor

if extern_module_dir is not None and task_args.enable_extern_config:
basic_train_script_args += " --enable_extern_config " \
+ "--extern_module_dir " + extern_module_dir
+ " --extern_module_dir " + extern_module_dir
return basic_train_script_args


Expand Down

0 comments on commit 2d6554c

Please sign in to comment.