diff --git a/training/benchmarks/driver/base.py b/training/benchmarks/driver/base.py index ec255283a..db848c9f1 100644 --- a/training/benchmarks/driver/base.py +++ b/training/benchmarks/driver/base.py @@ -56,6 +56,10 @@ def setup_config(self, parser): if known_args.extern_module_dir: mod_util.install_extern_modules(known_args.extern_module_dir, self.extern_modules) + + if not hasattr(self.config, "local_rank"): + self.config.local_rank = 0 + self.logger = perf_logger.PerfLogger.get_default_logger( rank=self.config.local_rank) diff --git a/training/benchmarks/driver/config_manager.py b/training/benchmarks/driver/config_manager.py index 119e5c341..a928747db 100755 --- a/training/benchmarks/driver/config_manager.py +++ b/training/benchmarks/driver/config_manager.py @@ -157,7 +157,7 @@ def activate(base_config, enable_extern_config) # TODO:后续考虑换一个更优雅的方式 - if "tensorflow2" in base_config.__path__: + if hasattr(base_config, "__path__") and "tensorflow2" in base_config.__path__: base_config.override(parsed_params.__dict__, False) else: _merge_dict_to_config(parsed_params.__dict__, base_config.__dict__) diff --git a/training/benchmarks/resnet50/tensorflow2/configs/configs.py b/training/benchmarks/resnet50/tensorflow2/configs/configs.py index 5bf5c5960..fba248e46 100644 --- a/training/benchmarks/resnet50/tensorflow2/configs/configs.py +++ b/training/benchmarks/resnet50/tensorflow2/configs/configs.py @@ -18,6 +18,7 @@ from core import dataset_factory from . 
import base_configs from resnet import resnet_config +from absl import flags @dataclasses.dataclass @@ -36,11 +37,11 @@ class ResNetImagenetConfig(base_configs.ExperimentConfig): mean_subtract=True, standardize=True) train: base_configs.TrainConfig = base_configs.TrainConfig( - resume_checkpoint=True, + resume_checkpoint=False, epochs=90, steps=None, callbacks=base_configs.CallbacksConfig( - enable_checkpoint_and_export=True, enable_tensorboard=True), + enable_checkpoint_and_export=False, enable_tensorboard=True), metrics=['accuracy', 'top_5'], time_history=base_configs.TimeHistoryConfig(log_steps=100), tensorboard=base_configs.TensorBoardConfig(track_lr=True, @@ -49,14 +50,29 @@ class ResNetImagenetConfig(base_configs.ExperimentConfig): evaluation: base_configs.EvalConfig = base_configs.EvalConfig( epochs_between_evals=1, steps=None) model: base_configs.ModelConfig = resnet_config.ResNetModelConfig() + do_train: bool = True + target_accuracy: float = 0.76 -def get_config(model: str, dataset: str) -> base_configs.ExperimentConfig: +def get_config(flags_obj: flags.FlagValues, model: str, dataset: str) -> base_configs.ExperimentConfig: """Given model and dataset names, return the ExperimentConfig.""" + resnet_config = ResNetImagenetConfig( mode="train_and_eval", + model_dir="result", + train_dataset=dataset_factory.ImageNetConfig( + split='train', + data_dir = flags_obj.data_dir, + one_hot=False, + mean_subtract=True, + standardize=True), + validation_dataset=dataset_factory.ImageNetConfig( + split='validation', + data_dir = flags_obj.data_dir, + one_hot=False, + mean_subtract=True, + standardize=True), + ) dataset_model_config_map = { - 'imagenet': { - 'resnet': ResNetImagenetConfig(), - } + 'imagenet': { 'resnet': resnet_config } } try: return dataset_model_config_map[dataset][model] diff --git a/training/benchmarks/resnet50/tensorflow2/core/trainer_adapter.py b/training/benchmarks/resnet50/tensorflow2/core/trainer_adapter.py index 52d01e144..74a4f5c0c 100644 ---
a/training/benchmarks/resnet50/tensorflow2/core/trainer_adapter.py +++ b/training/benchmarks/resnet50/tensorflow2/core/trainer_adapter.py @@ -16,7 +16,7 @@ def get_models() -> Mapping[str, tf.keras.Model]: def convert_model(params): model_params = params.model.model_params.as_dict() - model = get_models()[params.model.name](**model_params) + model = get_models()[params.model.name.lower()](**model_params) return model diff --git a/training/benchmarks/resnet50/tensorflow2/resnet/common.py b/training/benchmarks/resnet50/tensorflow2/resnet/common.py index c4e48104a..40b8e27fb 100644 --- a/training/benchmarks/resnet50/tensorflow2/resnet/common.py +++ b/training/benchmarks/resnet50/tensorflow2/resnet/common.py @@ -192,6 +192,7 @@ def build_stats(history, eval_output, callbacks): # timestamp_log = callback.timestamp_log # stats['step_timestamp_log'] = timestamp_log stats['train_finish_time'] = callback.train_finish_time + stats['num_trained_samples'] = callback.num_trained_samples if callback.epoch_runtime_log: stats[ 'avg_exp_per_second'] = callback.average_examples_per_second diff --git a/training/benchmarks/resnet50/tensorflow2/run_pretraining.py b/training/benchmarks/resnet50/tensorflow2/run_pretraining.py index 1e144c537..28bf0620f 100644 --- a/training/benchmarks/resnet50/tensorflow2/run_pretraining.py +++ b/training/benchmarks/resnet50/tensorflow2/run_pretraining.py @@ -38,7 +38,6 @@ from configs import base_configs from configs import configs from resnet import common -from resnet import resnet_model from modeling import hyperparams from modeling import performance from core import trainer_adapter @@ -47,7 +46,6 @@ logger = None - def get_dtype_map() -> Mapping[str, tf.dtypes.DType]: """Returns the mapping from dtype string representations to TF dtypes.""" return { @@ -140,24 +138,23 @@ def check_must_envconfigs(params): ", ".join(must_configs)) params.local_rank = int(os.environ['FLAGPERF_NODE_RANK']) params.runtime.num_gpus = int(os.environ["FLAGPERF_NPROC"]) + 
if params.runtime.distribution_strategy == 'multi_worker_mirrored': hosts = os.environ["FLAGPERF_HOSTS"].split(",") ports = os.environ["FLAGPERF_HOSTS_PORTS"].split(",") params.runtime.worker_hosts = ",".join( - [hosts[i] + ":" + ports[i] for i in range(len(hosts))]) + [hosts[i] + ":" + ports[0] for i in range(len(hosts))]) params.runtime.task_index = int(os.environ['FLAGPERF_NODE_RANK']) return params - def _get_params_from_flags(flags_obj: flags.FlagValues): """Get ParamsDict from flags.""" global logger pp = pprint.PrettyPrinter() - - params = configs.get_config(model='resnet', dataset='imagenet') - logging.info('Base params: %s', pp.pformat(params.as_dict())) + params = configs.get_config(flags_obj, model='resnet', dataset='imagenet') + logging.info('_get_params_from_flags Base params: %s', pp.pformat(params.as_dict())) driver = Driver(params, []) driver.setup_config(argparse.ArgumentParser("renset50")) @@ -230,7 +227,6 @@ def initialize(params: base_configs.ExperimentConfig, if params.runtime.batchnorm_spatial_persistent: os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1' - def define_classifier_flags(): """Defines common flags for image classification.""" hyperparams_flags.initialize_common_flags() @@ -267,7 +263,9 @@ def define_classifier_flags(): flags.DEFINE_string('extern_module_dir', default=None, help='The extern module dir.') - + flags.DEFINE_string('vendor', + default=None, + help='AI chip vendor') def serialize_config(params: base_configs.ExperimentConfig, model_dir: str): """Serializes and saves the experiment config.""" @@ -341,7 +339,8 @@ def train_and_eval(params: base_configs.ExperimentConfig, steps_per_execution=steps_per_loop) initial_epoch = 0 - if params.train.resume_checkpoint: + logging.info("params.train.resume_checkpoint: %s", params.train.resume_checkpoint) + if hasattr(params.train, "resume_checkpoint") and params.train.resume_checkpoint: initial_epoch = resume_from_checkpoint( model=model, model_dir=params.model_ckpt_dir, @@ -374,9
+373,12 @@ def train_and_eval(params: base_configs.ExperimentConfig, driver.event(Event.INIT_END) init_end_time = logger.previous_log_time params.init_time = (init_end_time - init_start_time) / 1e+3 - - raw_train_start_time = logger.previous_log_time + + # train + stats_training = dict() + raw_train_start_time = time.time() driver.event(Event.TRAIN_START) + # model.fit()一次性加载整个数据集到内存中,适用于数据集较小,不会导致内存溢出的情况。 history = model.fit(train_dataset, epochs=train_epochs, steps_per_epoch=train_steps, @@ -384,9 +386,11 @@ def train_and_eval(params: base_configs.ExperimentConfig, callbacks=callbacks, verbose=2, **validation_kwargs) + stats_training['no_eval_time'] = time.time() - raw_train_start_time driver.event(Event.TRAIN_END) + # evaluate validation_output = None if not params.evaluation.skip_eval: validation_output = model.evaluate(validation_dataset, @@ -395,8 +399,11 @@ def train_and_eval(params: base_configs.ExperimentConfig, # TODO(dankondratyuk): eval and save final test accuracy stats = common.build_stats(history, validation_output, callbacks) - raw_train_end_time = logger.previous_log_time - params.raw_train_time = (raw_train_end_time - raw_train_start_time) / 1e+3 + params.raw_train_time = time.time() - raw_train_start_time + stats['no_eval_time'] = stats_training['no_eval_time'] + stats['raw_train_time'] = params.raw_train_time + stats['pure_compute_time'] = stats_training['no_eval_time'] + return stats, params @@ -427,7 +434,6 @@ def run(flags_obj: flags.FlagValues, """ params, driver = _get_params_from_flags(flags_obj) - if params.mode == 'train_and_eval': return train_and_eval(params, strategy_override, driver) elif params.mode == 'export_only': @@ -446,7 +452,13 @@ def main(_): if params.do_train: finished_info = { "e2e_time": e2e_time, - "training_sequences_per_second": stats['avg_exp_per_second'], + "num_trained_samples": stats['num_trained_samples'], + "train_time": stats['raw_train_time'], + "train_no_eval_time": stats['no_eval_time'], + 
"pure_training_computing_time": stats['pure_compute_time'], + "throughput(ips)_raw": stats['num_trained_samples'] / stats['raw_train_time'], + "throughput(ips)_no_eval": stats['num_trained_samples'] / stats['no_eval_time'], + "throughput(ips)_pure_compute": stats['num_trained_samples'] / stats['pure_compute_time'], "converged": stats["converged"], "final_accuracy": stats["accuracy_top_1"], "final_loss": stats["eval_loss"], diff --git a/training/benchmarks/resnet50/tensorflow2/utils/misc/keras_utils.py b/training/benchmarks/resnet50/tensorflow2/utils/misc/keras_utils.py index cf7b2f65e..708795153 100644 --- a/training/benchmarks/resnet50/tensorflow2/utils/misc/keras_utils.py +++ b/training/benchmarks/resnet50/tensorflow2/utils/misc/keras_utils.py @@ -86,6 +86,11 @@ def global_steps(self): """The current 1-indexed global step.""" return self.steps_before_epoch + self.steps_in_epoch + @property + def num_trained_samples(self): + """number of trained samples""" + return self.global_steps * self.batch_size + @property def average_steps_per_second(self): """The average training steps per second across all epochs.""" diff --git a/training/nvidia/resnet50-tensorflow2/README.md b/training/nvidia/resnet50-tensorflow2/README.md new file mode 100644 index 000000000..3e244960e --- /dev/null +++ b/training/nvidia/resnet50-tensorflow2/README.md @@ -0,0 +1,50 @@ +### 1. 数据集准备 +[数据集准备](../../benchmarks/resnet50/tensorflow2/README.md#2数据集准备) + +### 2. 
Nvidia GPU配置与运行信息参考 +#### 环境配置 +- ##### 硬件环境 + - 机器型号: NVIDIA DGX A100(40G) + - 加速卡型号: NVIDIA_A100-SXM4-40GB + - CPU型号: AMD EPYC7742-64core@1.5G + - 多机网络类型、带宽: InfiniBand,200Gb/s + +- ##### 软件环境 + - OS版本:Ubuntu 20.04 + - OS kernel版本: 5.4.0-113-generic + - 加速卡驱动版本:470.129.06 + - Docker 版本:20.10.16 + - 训练框架版本:tensorflow 2.6.0+nv + - 依赖软件版本: + - cuda: 11.4 + + +### 运行情况 + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| -------------- | ----------------------- | ----------------------------------------------- | +| 任务类别 | 图像分类 | | +| 模型 | resnet50 | | +| 数据集 | ImageNet2012 | | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | nvidia A100 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | +| 总吞吐量 | p_whole,见“性能指标” | 实际训练图片数除以总时间(performance_whole) | +| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | +| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p_core>=p_train>p_whole) | +| 训练结果 | acc1,见“性能指标” | 单位为top1分类准确率(acc1) | +| 额外修改项 | 无 | | + +* 性能指标 + +| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc1 | mem | +| ------------------- | --------- | ------ | -------- | ------- | ------- | ------ | ----- | --------- | +| A100单机单卡(1x1) | fp32 | / | / | 746.3 | 748.1 | 748.1 | / | 39.3/40.0 | +| A100单机8卡(1x8) | fp32 | / | 18578 | 5175.1 | 5176.7 | 5176.7 | 75.5% | 39.5/40.0 | + + + diff --git a/training/nvidia/resnet50-tensorflow2/config/config_A100x1x1.py b/training/nvidia/resnet50-tensorflow2/config/config_A100x1x1.py new file mode 100644 index 000000000..147aecff1 --- /dev/null +++ b/training/nvidia/resnet50-tensorflow2/config/config_A100x1x1.py @@ -0,0 +1,59 @@ +# Training configuration for ResNet trained on ImageNet on GPUs. +# Reaches > 76.1% within 90 epochs. +# Note = This configuration uses a scaled per-replica batch size based on the number of devices. 
+# Base params = base_configs.ExperimentConfig do_train = True model_dir = 'result' model_ckpt_dir = ' ' mode = 'train_and_eval' target_accuracy: float = 0.76 # runtime = dict( # distribution_strategy = 'multi_worker_mirrored', # run_eagerly = None, # tpu = None, # batchnorm_spatial_persistent = True) runtime = dict(distribution_strategy='mirrored', run_eagerly=None, tpu=None, batchnorm_spatial_persistent=True) train_dataset = dict(name='imagenet2012', data_dir='/raid/dataset/ImageNet2012/tf_records', builder='records', split='train', image_size=224, num_classes=1000, num_examples=1281167, batch_size=128, use_per_replica_batch_size=True, dtype='float16', mean_subtract=True, standardize=True) validation_dataset = dict( + name='imagenet2012', + data_dir='/raid/dataset/ImageNet2012/tf_records', + builder='records', + split='validation', + image_size=224, + num_classes=1000, + num_examples=50000, + batch_size=128, #256 + use_per_replica_batch_size=True, + dtype='float16', + mean_subtract=True, + standardize=True) +model = dict(name='resnet', + model_params=dict(rescale_inputs=False), + optimizer=dict(name='momentum', + momentum=0.9, + decay=0.9, + epsilon=0.001), + loss=dict(label_smoothing=0.1)) +train = dict(resume_checkpoint=True, + epochs=90, + time_history=dict(log_steps=100), + callbacks=dict(enable_checkpoint_and_export=False, + enable_backup_and_restore=False)) +evaluation = dict(epochs_between_evals=1) + +# local_rank for distributed training on gpus +local_rank: int = -1 ## for log diff --git a/training/nvidia/resnet50-tensorflow2/config/config_A100x1x8.py b/training/nvidia/resnet50-tensorflow2/config/config_A100x1x8.py index cd12549df..147aecff1 100644 --- a/training/nvidia/resnet50-tensorflow2/config/config_A100x1x8.py +++ b/training/nvidia/resnet50-tensorflow2/config/config_A100x1x8.py @@ -6,7 +6,7 @@ model_dir = 'result' model_ckpt_dir = ' ' mode = 'train_and_eval' -target_accuracy: float = 0.75 +target_accuracy:
float = 0.76 # runtime = dict( # distribution_strategy = 'multi_worker_mirrored', # run_eagerly = None, @@ -17,7 +17,7 @@ tpu=None, batchnorm_spatial_persistent=True) train_dataset = dict(name='imagenet2012', - data_dir='/mnt/data/ImageNet2012/tf_records', + data_dir='/raid/dataset/ImageNet2012/tf_records', builder='records', split='train', image_size=224, @@ -30,7 +30,7 @@ standardize=True) validation_dataset = dict( name='imagenet2012', - data_dir='/mnt/data/ImageNet2012/tf_records', + data_dir='/raid/dataset/ImageNet2012/tf_records', builder='records', split='validation', image_size=224, @@ -52,7 +52,7 @@ epochs=90, time_history=dict(log_steps=100), callbacks=dict(enable_checkpoint_and_export=False, - enable_backup_and_restore=True)) + enable_backup_and_restore=False)) evaluation = dict(epochs_between_evals=1) # local_rank for distributed training on gpus diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index 2af948a72..cee0b9105 100644 --- a/training/run_benchmarks/config/test_conf.py +++ b/training/run_benchmarks/config/test_conf.py @@ -76,6 +76,7 @@ # "wav2vec2:pytorch_1.13:A100:1:8:1": "/raid/dataset/wav2vec2_data/LibriSpeech", # "WaveGlow:pytorch_1.13:A100:1:8:1": "/raid/dataset/LJSpeech/", + "resnet50:tensorflow2:A100:1:8:1": "/raid/dataset/ImageNet2012/tf_records/", # "distilbert:pytorch_1.12:A100:1:8:1": "/raid/dataset/distilbert/", diff --git a/training/run_benchmarks/tensorflow2/start_tensorflow2_task.py b/training/run_benchmarks/tensorflow2/start_tensorflow2_task.py index c8b8f47aa..4dc593cd0 100644 --- a/training/run_benchmarks/tensorflow2/start_tensorflow2_task.py +++ b/training/run_benchmarks/tensorflow2/start_tensorflow2_task.py @@ -145,11 +145,12 @@ def _get_basic_train_script_args(task_args): extern_module_dir = helper.get_extern_module_dir(task_args) basic_train_script_args = " --data_dir " + task_args.data_dir \ + " --extern_config_dir " + config_dir \ - + " --extern_config_file " + 
config_file + + " --extern_config_file " + config_file \ + + " --vendor " + task_args.vendor if extern_module_dir is not None and task_args.enable_extern_config: basic_train_script_args += " --enable_extern_config " \ - + "--extern_module_dir " + extern_module_dir + + " --extern_module_dir " + extern_module_dir return basic_train_script_args