From 983b725c12a7032481d039d6190764816e0e1ddb Mon Sep 17 00:00:00 2001 From: Yuhao Wang <45590791+cnstark@users.noreply.github.com> Date: Tue, 25 Oct 2022 18:14:20 +0800 Subject: [PATCH] feature: mlu (#70) * feature: support MLU * fix * fix * fix: env * fix * fix: to_device * move device * fix * fix: launch_runner --- easytorch/__init__.py | 3 +- easytorch/core/checkpoint.py | 16 ++-- easytorch/core/runner.py | 25 +------ easytorch/device.py | 74 +++++++++++++++++++ easytorch/entry_points/easytrain.py | 4 +- easytorch/launcher/dist_wrap.py | 33 +++++---- easytorch/launcher/launcher.py | 35 ++++++--- easytorch/utils/__init__.py | 10 +-- easytorch/utils/dist.py | 3 +- easytorch/utils/env.py | 73 ++++++++++-------- easytorch/version.py | 2 +- .../imagenet/configs/resnet50_8x_mlu_cfg.py | 66 +++++++++++++++++ examples/imagenet/imagenet_runner.py | 11 +-- examples/imagenet/validate.py | 5 +- .../linear_regression_runner.py | 7 +- examples/mnist/mnist_runner.py | 11 +-- examples/mnist/validate.py | 5 +- tests/random_test/random_test.py | 2 +- 18 files changed, 271 insertions(+), 114 deletions(-) create mode 100644 easytorch/device.py create mode 100644 examples/imagenet/configs/resnet50_8x_mlu_cfg.py diff --git a/easytorch/__init__.py b/easytorch/__init__.py index 06703ed..5b0d9e0 100644 --- a/easytorch/__init__.py +++ b/easytorch/__init__.py @@ -4,6 +4,5 @@ from .version import __version__ __all__ = [ - 'Config', 'import_config', 'Runner', 'Runner', 'AvgMeter', 'MeterPool', 'launch_runner', - 'launch_training', '__version__' + 'Config', 'import_config', 'Runner', 'AvgMeter', 'MeterPool', 'launch_runner', 'launch_training', '__version__' ] diff --git a/easytorch/core/checkpoint.py b/easytorch/core/checkpoint.py index 41809a1..00aa049 100644 --- a/easytorch/core/checkpoint.py +++ b/easytorch/core/checkpoint.py @@ -7,6 +7,8 @@ import torch from ..utils import get_logger, get_local_rank +from ..device import get_device_type + DEFAULT_LOGGER = 
get_logger('easytorch-checkpoint') @@ -28,8 +30,7 @@ def get_last_ckpt_path(ckpt_save_dir: str, name_pattern: str = r'^.+_[\d]*.pt$') return os.path.join(ckpt_save_dir, ckpt_list[-1]) -def load_ckpt(ckpt_save_dir: str, ckpt_path: str = None, use_gpu: bool = True, - logger: Logger = DEFAULT_LOGGER) -> Dict: +def load_ckpt(ckpt_save_dir: str, ckpt_path: str = None, logger: Logger = DEFAULT_LOGGER) -> Dict: """Load checkpoint if param `ckpt_path` is None, load the last checkpoint in `ckpt_save_dir`, else load checkpoint from `ckpt_path` @@ -37,7 +38,6 @@ def load_ckpt(ckpt_save_dir: str, ckpt_path: str = None, use_gpu: bool = True, Args: ckpt_save_dir (str): checkpoint save directory ckpt_path (str): checkpoint path, default is None - use_gpu (bool): set to ``True`` to load checkpoint to GPU logger (Logger): logger, default is Logger('easytorch') Returns: @@ -46,10 +46,12 @@ def load_ckpt(ckpt_save_dir: str, ckpt_path: str = None, use_gpu: bool = True, if ckpt_path is None: ckpt_path = get_last_ckpt_path(ckpt_save_dir) - if use_gpu: - map_location = 'cuda:{}'.format(get_local_rank()) - else: - map_location = 'cpu' + map_location = { + 'gpu': 'cuda:{}'.format(get_local_rank()), + 'mlu': None, + 'cpu': 'cpu' + }[get_device_type()] + logger.info('Loading Checkpoint from \'{}\''.format(ckpt_path)) return torch.load(ckpt_path, map_location=map_location) diff --git a/easytorch/core/runner.py b/easytorch/core/runner.py index acc4c94..caf6565 100644 --- a/easytorch/core/runner.py +++ b/easytorch/core/runner.py @@ -18,6 +18,7 @@ from .optimizer_builder import build_optim, build_lr_scheduler from ..config import Config, get_ckpt_save_dir from ..utils import TimePredictor, get_logger, get_local_rank, is_master, master_only, set_env +from ..device import to_device class Runner(metaclass=ABCMeta): @@ -32,7 +33,6 @@ def __init__(self, cfg: Config): set_env(cfg.get('ENV', {})) # param - self.use_gpu = cfg.get('GPU_NUM', 0) != 0 self.model_name = cfg['MODEL.NAME'] self.ckpt_save_dir 
= get_ckpt_save_dir(cfg) self.logger.info('Set ckpt save dir: \'{}\''.format(self.ckpt_save_dir)) @@ -86,22 +86,6 @@ def init_logger(self, logger: logging.Logger = None, logger_name: str = None, else: raise TypeError('At least one of logger and logger_name is not None') - def to_running_device(self, src: Union[torch.Tensor, nn.Module]) -> Union[torch.Tensor, nn.Module]: - """Move `src` to the running device. If `self.use_gpu` is ```True```, - the running device is GPU, else the running device is CPU. - - Args: - src (Union[torch.Tensor, nn.Module]): source - - Returns: - target (Union[torch.Tensor, nn.Module]) - """ - - if self.use_gpu: - return src.cuda() - else: - return src.cpu() - @staticmethod @abstractmethod def define_model(cfg: Config) -> nn.Module: @@ -198,7 +182,7 @@ def build_model(self, cfg: Config) -> nn.Module: self.logger.info('Building model.') model = self.define_model(cfg) - model = self.to_running_device(model) + model = to_device(model) if torch.distributed.is_initialized(): model = DDP( model, @@ -273,7 +257,7 @@ def load_model_resume(self, strict: bool = True): """ try: - checkpoint_dict = load_ckpt(self.ckpt_save_dir, use_gpu=self.use_gpu, logger=self.logger) + checkpoint_dict = load_ckpt(self.ckpt_save_dir, logger=self.logger) if isinstance(self.model, DDP): self.model.module.load_state_dict(checkpoint_dict['model_state_dict'], strict=strict) else: @@ -301,8 +285,7 @@ def load_model(self, ckpt_path: str = None, strict: bool = True): """ try: - checkpoint_dict = load_ckpt(self.ckpt_save_dir, ckpt_path=ckpt_path, use_gpu=self.use_gpu, - logger=self.logger) + checkpoint_dict = load_ckpt(self.ckpt_save_dir, ckpt_path=ckpt_path, logger=self.logger) if isinstance(self.model, DDP): self.model.module.load_state_dict(checkpoint_dict['model_state_dict'], strict=strict) else: diff --git a/easytorch/device.py b/easytorch/device.py new file mode 100644 index 0000000..7db5654 --- /dev/null +++ b/easytorch/device.py @@ -0,0 +1,74 @@ +from typing import 
Union + +import torch +from torch import nn + +__all__ = [ + 'get_device_type', 'set_device_type', 'get_device_count', 'set_device', 'to_device', 'set_device_manual_seed' +] + +_DEVICE_TYPE = 'gpu' + + +def get_device_type() -> str: + return _DEVICE_TYPE + + +def set_device_type(device_type: str): + global _DEVICE_TYPE + if device_type not in ['gpu', 'mlu', 'cpu']: + raise ValueError('Unknown device type!') + if device_type == 'mlu': + __import__('torch_mlu') + _DEVICE_TYPE = device_type + + +def get_device_count() -> int: + if _DEVICE_TYPE == 'gpu': + return torch.cuda.device_count() + elif _DEVICE_TYPE == 'mlu': + torch_mlu = __import__('torch_mlu') + return torch_mlu.mlu_model.device_count() + elif _DEVICE_TYPE == 'cpu': + return 0 + else: + raise ValueError('Unknown device type!') + + +def set_device(device_id: int): + if _DEVICE_TYPE == 'gpu': + torch.cuda.set_device(device_id) + elif _DEVICE_TYPE == 'mlu': + torch_mlu = __import__('torch_mlu') + torch_mlu.mlu_model.set_device(device_id) + else: + raise ValueError('Unknown device type!') + + +def to_device(src: Union[torch.Tensor, nn.Module], device_id: int = None) -> Union[torch.Tensor, nn.Module]: + if _DEVICE_TYPE == 'gpu': + if device_id is None: + return src.cuda() + else: + return src.to('cuda:{:d}'.format(device_id)) + elif _DEVICE_TYPE == 'mlu': + __import__('torch_mlu') + if device_id is None: + return src.mlu() + else: + return src.to('mlu:{:d}'.format(device_id)) + elif _DEVICE_TYPE == 'cpu': + return src.cpu() + else: + raise ValueError('Unknown device type!') + + +def set_device_manual_seed(seed: int): + torch.manual_seed(seed) + if _DEVICE_TYPE == 'gpu': + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + elif _DEVICE_TYPE == 'mlu': + torch_mlu = __import__('torch_mlu') + torch_mlu.mlu_model.manual_seed(seed) + torch_mlu.mlu_model.manual_seed_all(seed) diff --git a/easytorch/entry_points/easytrain.py b/easytorch/entry_points/easytrain.py index e09e4e2..654b1e8 100644 --- 
a/easytorch/entry_points/easytrain.py +++ b/easytorch/entry_points/easytrain.py @@ -9,7 +9,7 @@ def parse_args(): parser = ArgumentParser(description='Welcome to EasyTorch!') parser.add_argument('-c', '--cfg', help='training config', required=True) parser.add_argument('--node-rank', default=0, type=int, help='node rank for distributed training') - parser.add_argument('--gpus', help='visible gpus', type=str) + parser.add_argument('--devices', help='visible devices', type=str) return parser.parse_args() @@ -22,4 +22,4 @@ def easytrain(): args = parse_args() # train - launch_training(args.cfg, args.gpus, args.node_rank) + launch_training(args.cfg, args.devices, args.node_rank) diff --git a/easytorch/launcher/dist_wrap.py b/easytorch/launcher/dist_wrap.py index 1396316..bf1c530 100644 --- a/easytorch/launcher/dist_wrap.py +++ b/easytorch/launcher/dist_wrap.py @@ -5,6 +5,7 @@ import torch from ..utils import get_logger +from ..device import get_device_type, set_device_type, get_device_count, set_device def dist_func(local_rank: int, dist_params: Dict[str, Any], func: Callable, *args): @@ -18,7 +19,7 @@ def dist_func(local_rank: int, dist_params: Dict[str, Any], func: Callable, *arg logger = get_logger('easytorch-launcher') - rank = dist_params['gpu_num'] * dist_params['node_rank'] + local_rank + rank = dist_params['device_num'] * dist_params['node_rank'] + local_rank logger.info( 'Launching in distributed mode. 
Distributed parameters:'\ 'word_size={:d}, node_rank={:d}, rank={:d}, local_rank={:d}, dist_backend={}, init_method={}'.format( @@ -27,6 +28,8 @@ def dist_func(local_rank: int, dist_params: Dict[str, Any], func: Callable, *arg ) ) + set_device_type(dist_params['device_type']) + torch.distributed.init_process_group( backend=dist_params['dist_backend'], init_method=dist_params['init_method'], @@ -34,7 +37,7 @@ def dist_func(local_rank: int, dist_params: Dict[str, Any], func: Callable, *arg world_size=dist_params['word_size'] ) - torch.cuda.set_device(local_rank) + set_device(local_rank) args, kwargs = args func(*args, **kwargs) @@ -42,7 +45,7 @@ def dist_func(local_rank: int, dist_params: Dict[str, Any], func: Callable, *arg def dist_wrap(func: Callable, node_num: int = 1, - gpu_num: int = 1, + device_num: int = 1, node_rank: int = 0, dist_backend: Optional[Union[str, torch.distributed.Backend]] = None, init_method: Optional[str] = None) -> Callable: @@ -55,7 +58,7 @@ def dist_wrap(func: Callable, >>> function_dist = dist_wrap( >>> function, >>> node_num=node_num, - >>> gpu_num=gpu_num, + >>> device_num=device_num, >>> node_rank=node_rank, >>> dist_backend=dist_backend, >>> init_method=init_method @@ -65,7 +68,7 @@ def dist_wrap(func: Callable, Args: func (Callable): The function. node_num (int, optional): Number of node. Defaults to 1. - gpu_num (int, optional): Number of gpus per node. Defaults to 1. + device_num (int, optional): Number of devices per node. Defaults to 1. node_rank (int, optional): Rank of current node. Defaults to 0. dist_backend (Optional[Union[str, distributed.Backend]], optional): The backend of DDP. Defaults to None, means using `nccl` as the backend. 
@@ -79,23 +82,22 @@ def dist_wrap(func: Callable, if node_num < 1: raise ValueError('The node_num must be greater than 1!') - if gpu_num < 0: - raise ValueError('The gpu_num must be greater than 0!') + if device_num < 0: + raise ValueError('The device_num must be greater than 0!') - word_size = node_num * gpu_num + word_size = node_num * device_num if word_size == 0: # CPU mode return func else: - # GPU mode + # DEVICE mode if node_rank >= node_num: raise ValueError('The node_rank must be less than dist_node_num!') - if gpu_num != torch.cuda.device_count(): - raise RuntimeError('GPU num not match, cfg.GPU_NUM = {:d}, but torch.cuda.device_count() = {:d}'.format( - gpu_num, torch.cuda.device_count() - )) + if device_num != get_device_count(): + raise RuntimeError('Device num not match, cfg.DEVICE_NUM = {:d}, ' \ + 'but torch.cuda.device_count() = {:d}'.format(device_num, get_device_count())) if word_size == 1: return func @@ -112,7 +114,8 @@ def dist_wrap(func: Callable, @functools.wraps(func) def wrapper(*args, **kwargs): dist_params = { - 'gpu_num': gpu_num, + 'device_type': get_device_type(), + 'device_num': device_num, 'node_rank': node_rank, 'word_size': word_size, 'dist_backend': dist_backend, @@ -122,7 +125,7 @@ def wrapper(*args, **kwargs): torch.multiprocessing.spawn( dist_func, args=(dist_params, func, args, kwargs), - nprocs=gpu_num, + nprocs=device_num, join=True ) diff --git a/easytorch/launcher/launcher.py b/easytorch/launcher/launcher.py index c972efd..f7f9af9 100644 --- a/easytorch/launcher/launcher.py +++ b/easytorch/launcher/launcher.py @@ -2,7 +2,8 @@ from typing import Callable, Dict, Union, Tuple from ..config import init_cfg -from ..utils import set_gpus, get_logger +from ..utils import set_visible_devices, get_logger +from ..device import set_device_type from .dist_wrap import dist_wrap @@ -34,7 +35,7 @@ def training_func(cfg: Dict): raise e -def launch_training(cfg: Union[Dict, str], gpus: str = None, node_rank: int = 0): +def 
launch_training(cfg: Union[Dict, str], devices: str = None, node_rank: int = 0): """Launch training process defined by `cfg`. Support distributed data parallel training when the number of available GPUs is greater than one. @@ -48,7 +49,7 @@ def launch_training(cfg: Union[Dict, str], gpus: str = None, node_rank: int = 0) Args: cfg (Union[Dict, str]): Easytorch config. - gpus (str): set ``CUDA_VISIBLE_DEVICES`` environment variable. + devices (str): visible device ids; sets ``CUDA_VISIBLE_DEVICES`` (GPU) or ``MLU_VISIBLE_DEVICES`` (MLU). node_rank (int): Rank of the current node. """ @@ -57,13 +58,27 @@ def launch_training(cfg: Union[Dict, str], gpus: str = None, node_rank: int = 0) cfg = init_cfg(cfg, node_rank == 0) - if cfg.get('GPU_NUM', 0) != 0: - set_gpus(gpus) + if cfg.get('DEVICE') is not None: + set_device_type(cfg['DEVICE']) + device_num = cfg.get('DEVICE_NUM', 0) + elif cfg.get('GPU_NUM', 0) != 0 or cfg.get('MLU_NUM', 0) != 0: + if cfg.get('GPU_NUM', 0) != 0 and cfg.get('MLU_NUM', 0) == 0: + set_device_type('gpu') + device_num = cfg.get('GPU_NUM', 0) + elif cfg.get('GPU_NUM', 0) == 0 and cfg.get('MLU_NUM', 0) != 0: + set_device_type('mlu') + device_num = cfg.get('MLU_NUM', 0) + else: + raise ValueError('At least one of `CFG.GPU_NUM` and `CFG.MLU_NUM` is 0.') + set_visible_devices(devices) + else: + set_device_type('cpu') + device_num = 0 train_dist = dist_wrap( training_func, node_num=cfg.get('DIST_NODE_NUM', 1), - gpu_num=cfg.get('GPU_NUM', 0), + device_num=device_num, node_rank=node_rank, dist_backend=cfg.get('DIST_BACKEND'), init_method=cfg.get('DIST_INIT_METHOD') @@ -71,7 +86,7 @@ def launch_training(cfg: Union[Dict, str], gpus: str = None, node_rank: int = 0) train_dist(cfg) -def launch_runner(cfg: Union[Dict, str], fn: Callable, args: Tuple = (), gpus: str = None): +def launch_runner(cfg: Union[Dict, str], fn: Callable, args: Tuple = (), device_type: str = 'gpu', devices: str = None): """Launch runner defined by `cfg`, and call `fn`. 
Args: @@ -89,8 +104,10 @@ def launch_runner(cfg: Union[Dict, str], fn: Callable, args: Tuple = (), gpus: s cfg = init_cfg(cfg, True) - if cfg.get('GPU_NUM', 0) != 0: - set_gpus(gpus) + set_device_type(device_type) + + if device_type != 'cpu': + set_visible_devices(devices) # init runner runner = cfg['RUNNER'](cfg) diff --git a/easytorch/utils/__init__.py b/easytorch/utils/__init__.py index 9732249..d9b2df3 100644 --- a/easytorch/utils/__init__.py +++ b/easytorch/utils/__init__.py @@ -1,12 +1,10 @@ -from .env import set_gpus, set_tf32_mode, setup_determinacy, set_env -from .timer import Timer, TimePredictor +from .env import set_visible_devices, set_tf32_mode, setup_determinacy, set_env from .dist import get_rank, get_local_rank, get_world_size, is_rank, is_master, master_only from .logging import get_logger from .named_hook import NamedForwardHook, NamedBackwardHook - +from .timer import Timer, TimePredictor __all__ = [ - 'set_gpus', 'Timer', 'TimePredictor', 'set_tf32_mode', 'setup_determinacy', 'set_env', - 'get_rank', 'get_local_rank', 'get_world_size', 'is_rank', 'is_master', 'master_only', - 'NamedForwardHook', 'NamedBackwardHook', 'get_logger' + 'set_visible_devices', 'set_tf32_mode', 'setup_determinacy', 'set_env', 'get_rank', 'get_local_rank', 'get_world_size', 'is_rank', + 'is_master', 'master_only', 'get_logger', 'NamedForwardHook', 'NamedBackwardHook', 'Timer', 'TimePredictor' ] diff --git a/easytorch/utils/dist.py b/easytorch/utils/dist.py index 409450f..850c0c4 100644 --- a/easytorch/utils/dist.py +++ b/easytorch/utils/dist.py @@ -2,6 +2,7 @@ import torch +from ..device import get_device_count # default master rank MASTER_RANK = 0 @@ -30,7 +31,7 @@ def get_local_rank() -> int: local_rank (int) """ - return get_rank() % torch.cuda.device_count() if torch.cuda.device_count() != 0 else 0 + return get_rank() % get_device_count() if get_device_count() != 0 else 0 def get_world_size() -> int: diff --git a/easytorch/utils/env.py b/easytorch/utils/env.py index 
8bf18a2..4610e81 100644 --- a/easytorch/utils/env.py +++ b/easytorch/utils/env.py @@ -7,24 +7,28 @@ from .logging import get_logger from .dist import get_rank +from ..device import get_device_type, set_device_manual_seed -def set_gpus(gpus: str): +def set_visible_devices(devices: str): """Set environment variable `CUDA_VISIBLE_DEVICES` to select GPU devices. Examples: - set_gpus('0,1,2,3') + set_visible_devices('0,1,2,3') Args: - gpus (str): environment variable `CUDA_VISIBLE_DEVICES` value + devices (str): value for the `CUDA_VISIBLE_DEVICES` (GPU) or `MLU_VISIBLE_DEVICES` (MLU) environment variable """ logger = get_logger('easytorch-env') - if gpus is not None: - os.environ['CUDA_VISIBLE_DEVICES'] = gpus - logger.info('Use GPUs {}.'.format(gpus)) + if devices is not None: + os.environ[{ + 'gpu': 'CUDA_VISIBLE_DEVICES', + 'mlu': 'MLU_VISIBLE_DEVICES' + }[get_device_type()]] = devices + logger.info('Use devices {}.'.format(devices)) else: - logger.info('Use all GPUs.') + logger.info('Use all devices.') def set_tf32_mode(tf32_mode: bool): @@ -36,17 +40,21 @@ def set_tf32_mode(tf32_mode: bool): """ logger = get_logger('easytorch-env') - if torch.__version__ >= '1.7.0': - if tf32_mode: - logger.info('Enable TF32 mode') + if get_device_type() == 'gpu': + if torch.__version__ >= '1.7.0': + if tf32_mode: + logger.info('Enable TF32 mode') + else: + # disable tf32 mode on Ampere gpu + torch.backends.cuda.matmul.allow_tf32 = False + torch.backends.cudnn.allow_tf32 = False + logger.info('Disable TF32 mode') else: - # disable tf32 mode on Ampere gpu - torch.backends.cuda.matmul.allow_tf32 = False - torch.backends.cudnn.allow_tf32 = False - logger.info('Disable TF32 mode') + if tf32_mode: + raise RuntimeError('Torch version {} does not support tf32'.format(torch.__version__)) else: if tf32_mode: - raise RuntimeError('Torch version {} does not support tf32'.format(torch.__version__)) + raise RuntimeError('Device {} does not support tf32.'.format(get_device_type())) def setup_determinacy(seed: int, deterministic: bool = False, 
cudnn_enabled: bool = True, @@ -73,12 +81,12 @@ def setup_determinacy(seed: int, deterministic: bool = False, cudnn_enabled: boo random.seed(seed) np.random.seed(seed) - torch.manual_seed(seed) - torch.cuda.manual_seed(seed) - torch.cuda.manual_seed_all(seed) + set_device_manual_seed(seed) if deterministic: - os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' + if get_device_type() == 'gpu': + os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' + if torch.__version__ < '1.7.0': pass elif torch.__version__ < '1.8.0': @@ -86,15 +94,17 @@ def setup_determinacy(seed: int, deterministic: bool = False, cudnn_enabled: boo else: torch.use_deterministic_algorithms(True) logger.info('Use deterministic algorithms.') - if not cudnn_enabled: - torch.backends.cudnn.enabled = False - logger.info('Unset cudnn enabled.') - if not cudnn_benchmark: - torch.backends.cudnn.benchmark = False - logger.info('Unset cudnn benchmark.') - if cudnn_deterministic: - torch.backends.cudnn.deterministic = True - logger.info('Set cudnn deterministic.') + + if get_device_type() == 'gpu': + if not cudnn_enabled: + torch.backends.cudnn.enabled = False + logger.info('Unset cudnn enabled.') + if not cudnn_benchmark: + torch.backends.cudnn.benchmark = False + logger.info('Unset cudnn benchmark.') + if cudnn_deterministic: + torch.backends.cudnn.deterministic = True + logger.info('Set cudnn deterministic.') def set_env(env_cfg: Dict): @@ -122,12 +132,11 @@ def set_env(env_cfg: Dict): # determinacy seed = env_cfg.get('SEED') if seed is not None: - cudnn = env_cfg.get('CUDNN', {}) # each rank has different seed in distributed mode setup_determinacy( seed + get_rank(), env_cfg.get('DETERMINISTIC', False), - cudnn.get('ENABLED', True), - cudnn.get('BENCHMARK', True), - cudnn.get('DETERMINISTIC', False) + env_cfg.get('CUDNN.ENABLED', True), + env_cfg.get('CUDNN.BENCHMARK', True), + env_cfg.get('CUDNN.DETERMINISTIC', False) ) diff --git a/easytorch/version.py b/easytorch/version.py index 4629418..97e862c 100644 
--- a/easytorch/version.py +++ b/easytorch/version.py @@ -1,2 +1,2 @@ -__version__ = '1.2.12' +__version__ = '1.3' __all__ = ['__version__'] diff --git a/examples/imagenet/configs/resnet50_8x_mlu_cfg.py b/examples/imagenet/configs/resnet50_8x_mlu_cfg.py new file mode 100644 index 0000000..414a1f6 --- /dev/null +++ b/examples/imagenet/configs/resnet50_8x_mlu_cfg.py @@ -0,0 +1,66 @@ +import os +from easytorch import Config + +from imagenet_runner import ImagenetRunner + +CFG = Config() + +CFG.DESC = 'imagenet resnet50' +CFG.RUNNER = ImagenetRunner +CFG.MLU_NUM = 8 +CFG.DIST_BACKEND = 'cncl' + +CFG.MODEL = Config() +CFG.MODEL.NAME = 'resnet50' + +CFG.TRAIN = Config() + +CFG.TRAIN.NUM_EPOCHS = 90 +CFG.TRAIN.CKPT_SAVE_DIR = os.path.join( + 'checkpoints', + '_'.join([CFG.MODEL.NAME, str(CFG.TRAIN.NUM_EPOCHS)]) +) +CFG.TRAIN.CKPT_SAVE_STRATEGY = None + +CFG.TRAIN.OPTIM = Config() +CFG.TRAIN.OPTIM.TYPE = 'SGD' +CFG.TRAIN.OPTIM.PARAM = { + 'lr': 0.1, + 'momentum': 0.9, + 'weight_decay': 1e-4 +} + +CFG.TRAIN.LR_SCHEDULER = Config() +CFG.TRAIN.LR_SCHEDULER.TYPE = 'StepLR' +CFG.TRAIN.LR_SCHEDULER.PARAM = { + 'step_size': 30, + 'gamma': 0.1 +} + +IMAGENET_PATH = 'datasets/imagenet/jpegs' + +CFG.TRAIN.DATA = Config() +CFG.TRAIN.DATA.BATCH_SIZE = 32 +CFG.TRAIN.DATA.NUM_WORKERS = 4 +CFG.TRAIN.DATA.SHUFFLE = True + +CFG.TRAIN.DATA.DIR = os.path.join(IMAGENET_PATH, 'train') +CFG.TRAIN.DATA.CROP_SIZE = 224 +CFG.TRAIN.DATA.NORMALIZE = { + 'mean': [0.485, 0.456, 0.406], + 'std': [0.229, 0.224, 0.225] +} + +CFG.VAL = Config() + +CFG.VAL.INTERVAL = 1 + +CFG.VAL.DATA = Config() +CFG.VAL.DATA.BATCH_SIZE = 32 +CFG.VAL.DATA.DIR = os.path.join(IMAGENET_PATH, 'val') +CFG.VAL.DATA.CROP_SIZE = 224 +CFG.VAL.DATA.RESIZE = 256 +CFG.VAL.DATA.NORMALIZE = { + 'mean': [0.485, 0.456, 0.406], + 'std': [0.229, 0.224, 0.225] +} diff --git a/examples/imagenet/imagenet_runner.py b/examples/imagenet/imagenet_runner.py index 268eba0..c779260 100644 --- a/examples/imagenet/imagenet_runner.py +++ 
b/examples/imagenet/imagenet_runner.py @@ -6,6 +6,7 @@ from torchvision import models, datasets, transforms from easytorch import Runner +from easytorch.device import to_device def accuracy(output, target, topk=(1,)): @@ -33,7 +34,7 @@ def __init__(self, cfg: Dict): super().__init__(cfg) self.criterion = nn.CrossEntropyLoss() - self.criterion = self.to_running_device(self.criterion) + self.criterion = to_device(self.criterion) def init_training(self, cfg: Dict): super().init_training(cfg) @@ -80,8 +81,8 @@ def build_val_dataset(cfg: Dict): def train_iters(self, epoch: int, iter_index: int, data: Union[torch.Tensor, Tuple]) -> torch.Tensor: images, target = data - images = self.to_running_device(images) - target = self.to_running_device(target) + images = to_device(images) + target = to_device(target) output = self.model(images) @@ -99,8 +100,8 @@ def train_iters(self, epoch: int, iter_index: int, data: Union[torch.Tensor, Tup def val_iters(self, iter_index: int, data: Union[torch.Tensor, Tuple]): images, target = data - images = self.to_running_device(images) - target = self.to_running_device(target) + images = to_device(images) + target = to_device(target) output = self.model(images) diff --git a/examples/imagenet/validate.py b/examples/imagenet/validate.py index 04e4ca5..3de0876 100644 --- a/examples/imagenet/validate.py +++ b/examples/imagenet/validate.py @@ -7,7 +7,8 @@ def parse_args(): parser = ArgumentParser(description='Welcome to EasyTorch!') parser.add_argument('-c', '--cfg', help='training config', required=True) parser.add_argument('--ckpt', help='ckpt path. 
if it is None, load default ckpt in ckpt save dir', type=str) - parser.add_argument('--gpus', help='visible gpus', type=str) + parser.add_argument('--device-type', help='device type', type=str, default='gpu') + parser.add_argument('--devices', help='visible devices', type=str) return parser.parse_args() @@ -22,4 +23,4 @@ def main(cfg: dict, runner: Runner, ckpt: str = None): if __name__ == '__main__': args = parse_args() - launch_runner(args.cfg, main, (args.ckpt, ), gpus=args.gpus) + launch_runner(args.cfg, main, (args.ckpt, ), device_type=args.device_type, devices=args.devices) diff --git a/examples/linear_regression/linear_regression_runner.py b/examples/linear_regression/linear_regression_runner.py index 395345f..e9381aa 100644 --- a/examples/linear_regression/linear_regression_runner.py +++ b/examples/linear_regression/linear_regression_runner.py @@ -1,6 +1,7 @@ from torch import nn from easytorch import Runner +from easytorch.device import to_device from dataset import LinearDataset @@ -21,7 +22,7 @@ def init_training(self, cfg): super().init_training(cfg) self.loss = nn.MSELoss() - self.loss = self.to_running_device(self.loss) + self.loss = to_device(self.loss) self.register_epoch_meter('train_loss', 'train', '{:.2f}') @@ -68,8 +69,8 @@ def train_iters(self, epoch, iter_index, data): """ x, y = data - x = self.to_running_device(x) - y = self.to_running_device(y) + x = to_device(x) + y = to_device(y) output = self.model(x) loss = self.loss(output, y) diff --git a/examples/mnist/mnist_runner.py b/examples/mnist/mnist_runner.py index e12e525..e8e588e 100644 --- a/examples/mnist/mnist_runner.py +++ b/examples/mnist/mnist_runner.py @@ -5,6 +5,7 @@ import torchvision from easytorch import Runner +from easytorch.device import to_device from conv_net import ConvNet @@ -25,7 +26,7 @@ def init_training(self, cfg: Dict): super().init_training(cfg) self.loss = nn.NLLLoss() - self.loss = self.to_running_device(self.loss) + self.loss = to_device(self.loss) 
self.register_epoch_meter('train_loss', 'train', '{:.2f}') @@ -113,8 +114,8 @@ def train_iters(self, epoch: int, iter_index: int, data: Union[torch.Tensor, Tup """ input_, target_ = data - input_ = self.to_running_device(input_) - target_ = self.to_running_device(target_) + input_ = to_device(input_) + target_ = to_device(target_) output = self.model(input_) loss = self.loss(output, target_) @@ -130,8 +131,8 @@ def val_iters(self, iter_index: int, data: Union[torch.Tensor, Tuple]): """ input_, target_ = data - input_ = self.to_running_device(input_) - target_ = self.to_running_device(target_) + input_ = to_device(input_) + target_ = to_device(target_) output = self.model(input_) pred = output.data.max(1, keepdim=True)[1] diff --git a/examples/mnist/validate.py b/examples/mnist/validate.py index 04e4ca5..3de0876 100644 --- a/examples/mnist/validate.py +++ b/examples/mnist/validate.py @@ -7,7 +7,8 @@ def parse_args(): parser = ArgumentParser(description='Welcome to EasyTorch!') parser.add_argument('-c', '--cfg', help='training config', required=True) parser.add_argument('--ckpt', help='ckpt path. 
if it is None, load default ckpt in ckpt save dir', type=str) - parser.add_argument('--gpus', help='visible gpus', type=str) + parser.add_argument('--device-type', help='device type', type=str, default='gpu') + parser.add_argument('--devices', help='visible devices', type=str) return parser.parse_args() @@ -22,4 +23,4 @@ def main(cfg: dict, runner: Runner, ckpt: str = None): if __name__ == '__main__': args = parse_args() - launch_runner(args.cfg, main, (args.ckpt, ), gpus=args.gpus) + launch_runner(args.cfg, main, (args.ckpt, ), device_type=args.device_type, devices=args.devices) diff --git a/tests/random_test/random_test.py b/tests/random_test/random_test.py index 25b4e87..32135a1 100644 --- a/tests/random_test/random_test.py +++ b/tests/random_test/random_test.py @@ -88,4 +88,4 @@ def build_cfg(): if __name__ == '__main__': cfg_ = build_cfg() - launch_training(cfg_, gpus='0,1,2,3,4,5,6,7,8') + launch_training(cfg_, devices='0,1,2,3,4,5,6,7,8')