From 983b725c12a7032481d039d6190764816e0e1ddb Mon Sep 17 00:00:00 2001
From: Yuhao Wang <45590791+cnstark@users.noreply.github.com>
Date: Tue, 25 Oct 2022 18:14:20 +0800
Subject: [PATCH] feature: mlu (#70)

* feature: support MLU

* fix

* fix

* fix: env

* fix

* fix: to_device

* move device

* fix

* fix: launch_runner
---
 easytorch/__init__.py                         |  3 +-
 easytorch/core/checkpoint.py                  | 16 ++--
 easytorch/core/runner.py                      | 25 +------
 easytorch/device.py                           | 74 +++++++++++++++++++
 easytorch/entry_points/easytrain.py           |  4 +-
 easytorch/launcher/dist_wrap.py               | 33 +++++----
 easytorch/launcher/launcher.py                | 35 ++++++---
 easytorch/utils/__init__.py                   | 10 +--
 easytorch/utils/dist.py                       |  3 +-
 easytorch/utils/env.py                        | 73 ++++++++++--------
 easytorch/version.py                          |  2 +-
 .../imagenet/configs/resnet50_8x_mlu_cfg.py   | 66 +++++++++++++++++
 examples/imagenet/imagenet_runner.py          | 11 +--
 examples/imagenet/validate.py                 |  5 +-
 .../linear_regression_runner.py               |  7 +-
 examples/mnist/mnist_runner.py                | 11 +--
 examples/mnist/validate.py                    |  5 +-
 tests/random_test/random_test.py              |  2 +-
 18 files changed, 271 insertions(+), 114 deletions(-)
 create mode 100644 easytorch/device.py
 create mode 100644 examples/imagenet/configs/resnet50_8x_mlu_cfg.py

diff --git a/easytorch/__init__.py b/easytorch/__init__.py
index 06703ed..5b0d9e0 100644
--- a/easytorch/__init__.py
+++ b/easytorch/__init__.py
@@ -4,6 +4,5 @@
 from .version import __version__
 
 __all__ = [
-    'Config', 'import_config', 'Runner', 'Runner', 'AvgMeter', 'MeterPool', 'launch_runner',
-    'launch_training', '__version__'
+    'Config', 'import_config', 'Runner', 'AvgMeter', 'MeterPool', 'launch_runner', 'launch_training', '__version__'
 ]
diff --git a/easytorch/core/checkpoint.py b/easytorch/core/checkpoint.py
index 41809a1..00aa049 100644
--- a/easytorch/core/checkpoint.py
+++ b/easytorch/core/checkpoint.py
@@ -7,6 +7,8 @@
 import torch
 
 from ..utils import get_logger, get_local_rank
+from ..device import get_device_type
+
 
 DEFAULT_LOGGER = get_logger('easytorch-checkpoint')
 
@@ -28,8 +30,7 @@ def get_last_ckpt_path(ckpt_save_dir: str, name_pattern: str = r'^.+_[\d]*.pt$')
     return os.path.join(ckpt_save_dir, ckpt_list[-1])
 
 
-def load_ckpt(ckpt_save_dir: str, ckpt_path: str = None, use_gpu: bool = True,
-              logger: Logger = DEFAULT_LOGGER) -> Dict:
+def load_ckpt(ckpt_save_dir: str, ckpt_path: str = None, logger: Logger = DEFAULT_LOGGER) -> Dict:
     """Load checkpoint
     if param `ckpt_path` is None, load the last checkpoint in `ckpt_save_dir`,
     else load checkpoint from `ckpt_path`
@@ -37,7 +38,6 @@ def load_ckpt(ckpt_save_dir: str, ckpt_path: str = None, use_gpu: bool = True,
     Args:
         ckpt_save_dir (str): checkpoint save directory
         ckpt_path (str): checkpoint path, default is None
-        use_gpu (bool): set to ``True`` to load checkpoint to GPU
         logger (Logger): logger, default is Logger('easytorch')
 
     Returns:
@@ -46,10 +46,12 @@ def load_ckpt(ckpt_save_dir: str, ckpt_path: str = None, use_gpu: bool = True,
 
     if ckpt_path is None:
         ckpt_path = get_last_ckpt_path(ckpt_save_dir)
-    if use_gpu:
-        map_location = 'cuda:{}'.format(get_local_rank())
-    else:
-        map_location = 'cpu'
+    map_location = {
+        'gpu': 'cuda:{}'.format(get_local_rank()),
+        'mlu': None,
+        'cpu': 'cpu'
+    }[get_device_type()]
+
     logger.info('Loading Checkpoint from \'{}\''.format(ckpt_path))
     return torch.load(ckpt_path, map_location=map_location)
 
diff --git a/easytorch/core/runner.py b/easytorch/core/runner.py
index acc4c94..caf6565 100644
--- a/easytorch/core/runner.py
+++ b/easytorch/core/runner.py
@@ -18,6 +18,7 @@
 from .optimizer_builder import build_optim, build_lr_scheduler
 from ..config import Config, get_ckpt_save_dir
 from ..utils import TimePredictor, get_logger, get_local_rank, is_master, master_only, set_env
+from ..device import to_device
 
 
 class Runner(metaclass=ABCMeta):
@@ -32,7 +33,6 @@ def __init__(self, cfg: Config):
         set_env(cfg.get('ENV', {}))
 
         # param
-        self.use_gpu = cfg.get('GPU_NUM', 0) != 0
         self.model_name = cfg['MODEL.NAME']
         self.ckpt_save_dir = get_ckpt_save_dir(cfg)
         self.logger.info('Set ckpt save dir: \'{}\''.format(self.ckpt_save_dir))
@@ -86,22 +86,6 @@ def init_logger(self, logger: logging.Logger = None, logger_name: str = None,
         else:
             raise TypeError('At least one of logger and logger_name is not None')
 
-    def to_running_device(self, src: Union[torch.Tensor, nn.Module]) -> Union[torch.Tensor, nn.Module]:
-        """Move `src` to the running device. If `self.use_gpu` is ```True```,
-        the running device is GPU, else the running device is CPU.
-
-        Args:
-            src (Union[torch.Tensor, nn.Module]): source
-
-        Returns:
-            target (Union[torch.Tensor, nn.Module])
-        """
-
-        if self.use_gpu:
-            return src.cuda()
-        else:
-            return src.cpu()
-
     @staticmethod
     @abstractmethod
     def define_model(cfg: Config) -> nn.Module:
@@ -198,7 +182,7 @@ def build_model(self, cfg: Config) -> nn.Module:
 
         self.logger.info('Building model.')
         model = self.define_model(cfg)
-        model = self.to_running_device(model)
+        model = to_device(model)
         if torch.distributed.is_initialized():
             model = DDP(
                 model,
@@ -273,7 +257,7 @@ def load_model_resume(self, strict: bool = True):
         """
 
         try:
-            checkpoint_dict = load_ckpt(self.ckpt_save_dir, use_gpu=self.use_gpu, logger=self.logger)
+            checkpoint_dict = load_ckpt(self.ckpt_save_dir, logger=self.logger)
             if isinstance(self.model, DDP):
                 self.model.module.load_state_dict(checkpoint_dict['model_state_dict'], strict=strict)
             else:
@@ -301,8 +285,7 @@ def load_model(self, ckpt_path: str = None, strict: bool = True):
         """
 
         try:
-            checkpoint_dict = load_ckpt(self.ckpt_save_dir, ckpt_path=ckpt_path, use_gpu=self.use_gpu,
-                                        logger=self.logger)
+            checkpoint_dict = load_ckpt(self.ckpt_save_dir, ckpt_path=ckpt_path, logger=self.logger)
             if isinstance(self.model, DDP):
                 self.model.module.load_state_dict(checkpoint_dict['model_state_dict'], strict=strict)
             else:
diff --git a/easytorch/device.py b/easytorch/device.py
new file mode 100644
index 0000000..7db5654
--- /dev/null
+++ b/easytorch/device.py
@@ -0,0 +1,74 @@
+import importlib
+from typing import Union
+
+import torch
+from torch import nn
+
+__all__ = [
+    'get_device_type', 'set_device_type', 'get_device_count', 'set_device', 'to_device', 'set_device_manual_seed'
+]
+
+_DEVICE_TYPE = 'gpu'  # process-wide current device type; changed through `set_device_type`
+
+
+def _torch_mlu():
+    """Import and return `torch_mlu` on demand, so it is only required when the 'mlu' type is used."""
+    return importlib.import_module('torch_mlu')
+
+
+def get_device_type() -> str:
+    """Return the current device type: 'gpu', 'mlu' or 'cpu'."""
+    return _DEVICE_TYPE
+
+
+def set_device_type(device_type: str):
+    """Set the current device type; raise ValueError unless it is 'gpu', 'mlu' or 'cpu'."""
+    global _DEVICE_TYPE
+    if device_type not in ('gpu', 'mlu', 'cpu'):
+        raise ValueError('Unknown device type: {!r}!'.format(device_type))
+    if device_type == 'mlu':
+        _torch_mlu()  # fail here, before switching, if `torch_mlu` cannot be imported
+    _DEVICE_TYPE = device_type
+
+
+def get_device_count() -> int:
+    """Return the number of devices reported for the current type (always 0 for 'cpu')."""
+    if _DEVICE_TYPE == 'gpu':
+        return torch.cuda.device_count()
+    if _DEVICE_TYPE == 'mlu':
+        return _torch_mlu().mlu_model.device_count()
+    if _DEVICE_TYPE == 'cpu':
+        return 0
+    raise ValueError('Unknown device type: {!r}!'.format(_DEVICE_TYPE))
+
+
+def set_device(device_id: int):
+    """Select `device_id` as the current 'gpu' / 'mlu' device; raise ValueError for other types such as 'cpu'."""
+    if _DEVICE_TYPE == 'gpu':
+        torch.cuda.set_device(device_id)
+    elif _DEVICE_TYPE == 'mlu':
+        _torch_mlu().mlu_model.set_device(device_id)
+    else:
+        raise ValueError('Device type {!r} does not support set_device!'.format(_DEVICE_TYPE))
+
+
+def to_device(src: Union[torch.Tensor, nn.Module], device_id: int = None) -> Union[torch.Tensor, nn.Module]:
+    """Return `src` moved to the current device type; `device_id` picks one device (ignored for 'cpu')."""
+    if _DEVICE_TYPE == 'gpu':
+        return src.cuda() if device_id is None else src.to('cuda:{:d}'.format(device_id))
+    if _DEVICE_TYPE == 'mlu':
+        _torch_mlu()  # presumably registers `.mlu()` and the 'mlu' device string with torch -- TODO confirm
+        return src.mlu() if device_id is None else src.to('mlu:{:d}'.format(device_id))
+    if _DEVICE_TYPE == 'cpu':
+        return src.cpu()
+    raise ValueError('Unknown device type: {!r}!'.format(_DEVICE_TYPE))
+
+
+def set_device_manual_seed(seed: int):
+    """Seed torch's RNG and, for 'gpu' / 'mlu', also call the backend's `manual_seed` and `manual_seed_all`."""
+    torch.manual_seed(seed)
+    if _DEVICE_TYPE == 'gpu':
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+    elif _DEVICE_TYPE == 'mlu':
+        _torch_mlu().mlu_model.manual_seed(seed)
+        _torch_mlu().mlu_model.manual_seed_all(seed)
diff --git a/easytorch/entry_points/easytrain.py b/easytorch/entry_points/easytrain.py
index e09e4e2..654b1e8 100644
--- a/easytorch/entry_points/easytrain.py
+++ b/easytorch/entry_points/easytrain.py
@@ -9,7 +9,7 @@ def parse_args():
     parser = ArgumentParser(description='Welcome to EasyTorch!')
     parser.add_argument('-c', '--cfg', help='training config', required=True)
     parser.add_argument('--node-rank', default=0, type=int, help='node rank for distributed training')
-    parser.add_argument('--gpus', help='visible gpus', type=str)
+    parser.add_argument('--devices', help='visible devices', type=str)
     return parser.parse_args()
 
 
@@ -22,4 +22,4 @@ def easytrain():
     args = parse_args()
 
     # train
-    launch_training(args.cfg, args.gpus, args.node_rank)
+    launch_training(args.cfg, args.devices, args.node_rank)
diff --git a/easytorch/launcher/dist_wrap.py b/easytorch/launcher/dist_wrap.py
index 1396316..bf1c530 100644
--- a/easytorch/launcher/dist_wrap.py
+++ b/easytorch/launcher/dist_wrap.py
@@ -5,6 +5,7 @@
 import torch
 
 from ..utils import get_logger
+from ..device import get_device_type, set_device_type, get_device_count, set_device
 
 
 def dist_func(local_rank: int, dist_params: Dict[str, Any], func: Callable, *args):
@@ -18,7 +19,7 @@ def dist_func(local_rank: int, dist_params: Dict[str, Any], func: Callable, *arg
 
     logger = get_logger('easytorch-launcher')
 
-    rank = dist_params['gpu_num'] * dist_params['node_rank'] + local_rank
+    rank = dist_params['device_num'] * dist_params['node_rank'] + local_rank
     logger.info(
         'Launching in distributed mode. Distributed parameters:'\
         'word_size={:d}, node_rank={:d}, rank={:d}, local_rank={:d}, dist_backend={}, init_method={}'.format(
@@ -27,6 +28,8 @@ def dist_func(local_rank: int, dist_params: Dict[str, Any], func: Callable, *arg
         )
     )
 
+    set_device_type(dist_params['device_type'])
+
     torch.distributed.init_process_group(
         backend=dist_params['dist_backend'],
         init_method=dist_params['init_method'],
@@ -34,7 +37,7 @@ def dist_func(local_rank: int, dist_params: Dict[str, Any], func: Callable, *arg
         world_size=dist_params['word_size']
     )
 
-    torch.cuda.set_device(local_rank)
+    set_device(local_rank)
 
     args, kwargs = args
     func(*args, **kwargs)
@@ -42,7 +45,7 @@ def dist_func(local_rank: int, dist_params: Dict[str, Any], func: Callable, *arg
 
 def dist_wrap(func: Callable,
         node_num: int = 1,
-        gpu_num: int = 1,
+        device_num: int = 1,
         node_rank: int = 0,
         dist_backend: Optional[Union[str, torch.distributed.Backend]] = None,
         init_method: Optional[str] = None) -> Callable:
@@ -55,7 +58,7 @@ def dist_wrap(func: Callable,
         >>> function_dist = dist_wrap(
         >>>     function,
         >>>     node_num=node_num,
-        >>>     gpu_num=gpu_num,
+        >>>     device_num=device_num,
         >>>     node_rank=node_rank,
         >>>     dist_backend=dist_backend,
         >>>     init_method=init_method
@@ -65,7 +68,7 @@ def dist_wrap(func: Callable,
     Args:
         func (Callable): The function.
         node_num (int, optional): Number of node. Defaults to 1.
-        gpu_num (int, optional): Number of gpus per node. Defaults to 1.
+        device_num (int, optional): Number of devices per node. Defaults to 1.
         node_rank (int, optional): Rank of current node. Defaults to 0.
         dist_backend (Optional[Union[str, distributed.Backend]], optional): The backend of DDP.
             Defaults to None, means using `nccl` as the backend.
@@ -79,23 +82,22 @@ def dist_wrap(func: Callable,
     if node_num < 1:
         raise ValueError('The node_num must be greater than 1!')
 
-    if gpu_num < 0:
-        raise ValueError('The gpu_num must be greater than 0!')
+    if device_num < 0:
+        raise ValueError('The device_num must be greater than 0!')
 
-    word_size = node_num * gpu_num
+    word_size = node_num * device_num
 
     if word_size == 0:
         # CPU mode
         return func
     else:
-        # GPU mode
+        # DEVICE mode
         if node_rank >= node_num:
             raise ValueError('The node_rank must be less than dist_node_num!')
 
-        if gpu_num != torch.cuda.device_count():
-            raise RuntimeError('GPU num not match, cfg.GPU_NUM = {:d}, but torch.cuda.device_count() = {:d}'.format(
-                gpu_num, torch.cuda.device_count()
-            ))
+        if device_num != get_device_count():
+            raise RuntimeError('Device num not match, cfg.DEVICE_NUM = {:d}, but {} device count = {:d}'.format(
+                device_num, get_device_type(), get_device_count()))
 
         if word_size == 1:
             return func
@@ -112,7 +114,8 @@ def dist_wrap(func: Callable,
             @functools.wraps(func)
             def wrapper(*args, **kwargs):
                 dist_params = {
-                    'gpu_num': gpu_num,
+                    'device_type': get_device_type(),
+                    'device_num': device_num,
                     'node_rank': node_rank,
                     'word_size': word_size,
                     'dist_backend': dist_backend,
@@ -122,7 +125,7 @@ def wrapper(*args, **kwargs):
                 torch.multiprocessing.spawn(
                     dist_func,
                     args=(dist_params, func, args, kwargs),
-                    nprocs=gpu_num,
+                    nprocs=device_num,
                     join=True
                 )
 
diff --git a/easytorch/launcher/launcher.py b/easytorch/launcher/launcher.py
index c972efd..f7f9af9 100644
--- a/easytorch/launcher/launcher.py
+++ b/easytorch/launcher/launcher.py
@@ -2,7 +2,8 @@
 from typing import Callable, Dict, Union, Tuple
 
 from ..config import init_cfg
-from ..utils import set_gpus, get_logger
+from ..utils import set_visible_devices, get_logger
+from ..device import set_device_type
 from .dist_wrap import dist_wrap
 
 
@@ -34,7 +35,7 @@ def training_func(cfg: Dict):
         raise e
 
 
-def launch_training(cfg: Union[Dict, str], gpus: str = None, node_rank: int = 0):
+def launch_training(cfg: Union[Dict, str], devices: str = None, node_rank: int = 0):
     """Launch training process defined by `cfg`.
 
     Support distributed data parallel training when the number of available GPUs is greater than one.
@@ -48,7 +49,7 @@ def launch_training(cfg: Union[Dict, str], gpus: str = None, node_rank: int = 0)
 
     Args:
         cfg (Union[Dict, str]): Easytorch config.
-        gpus (str): set ``CUDA_VISIBLE_DEVICES`` environment variable.
+        devices (str): visible devices; sets ``CUDA_VISIBLE_DEVICES`` (gpu) or ``MLU_VISIBLE_DEVICES`` (mlu).
         node_rank (int): Rank of the current node.
     """
 
@@ -57,13 +58,27 @@ def launch_training(cfg: Union[Dict, str], gpus: str = None, node_rank: int = 0)
 
     cfg = init_cfg(cfg, node_rank == 0)
 
-    if cfg.get('GPU_NUM', 0) != 0:
-        set_gpus(gpus)
+    # Resolve the device type once: an explicit `CFG.DEVICE` wins over the `GPU_NUM` / `MLU_NUM` keys.
+    gpu_num, mlu_num = cfg.get('GPU_NUM', 0), cfg.get('MLU_NUM', 0)
+    if cfg.get('DEVICE') is not None:
+        device_type, device_num = cfg['DEVICE'], cfg.get('DEVICE_NUM', 0)
+    elif gpu_num != 0 and mlu_num != 0:
+        raise ValueError('At least one of `CFG.GPU_NUM` and `CFG.MLU_NUM` must be 0.')
+    elif gpu_num != 0:
+        device_type, device_num = 'gpu', gpu_num
+    elif mlu_num != 0:
+        device_type, device_num = 'mlu', mlu_num
+    else:
+        device_type, device_num = 'cpu', 0
+    set_device_type(device_type)
+    # Apply `devices` on every accelerator path, including when the type comes from `CFG.DEVICE`.
+    if device_type != 'cpu':
+        set_visible_devices(devices)
 
     train_dist = dist_wrap(
         training_func,
         node_num=cfg.get('DIST_NODE_NUM', 1),
-        gpu_num=cfg.get('GPU_NUM', 0),
+        device_num=device_num,
         node_rank=node_rank,
         dist_backend=cfg.get('DIST_BACKEND'),
         init_method=cfg.get('DIST_INIT_METHOD')
@@ -71,7 +86,7 @@ def launch_training(cfg: Union[Dict, str], gpus: str = None, node_rank: int = 0)
     train_dist(cfg)
 
 
-def launch_runner(cfg: Union[Dict, str], fn: Callable, args: Tuple = (), gpus: str = None):
+def launch_runner(cfg: Union[Dict, str], fn: Callable, args: Tuple = (), device_type: str = 'gpu', devices: str = None):
     """Launch runner defined by `cfg`, and call `fn`.
 
     Args:
@@ -89,8 +104,10 @@ def launch_runner(cfg: Union[Dict, str], fn: Callable, args: Tuple = (), gpus: s
 
     cfg = init_cfg(cfg, True)
 
-    if cfg.get('GPU_NUM', 0) != 0:
-        set_gpus(gpus)
+    set_device_type(device_type)
+
+    if device_type != 'cpu':
+        set_visible_devices(devices)
 
     # init runner
     runner = cfg['RUNNER'](cfg)
diff --git a/easytorch/utils/__init__.py b/easytorch/utils/__init__.py
index 9732249..d9b2df3 100644
--- a/easytorch/utils/__init__.py
+++ b/easytorch/utils/__init__.py
@@ -1,12 +1,10 @@
-from .env import set_gpus, set_tf32_mode, setup_determinacy, set_env
-from .timer import Timer, TimePredictor
+from .env import set_visible_devices, set_tf32_mode, setup_determinacy, set_env
 from .dist import get_rank, get_local_rank, get_world_size, is_rank, is_master, master_only
 from .logging import get_logger
 from .named_hook import NamedForwardHook, NamedBackwardHook
-
+from .timer import Timer, TimePredictor
 
 __all__ = [
-    'set_gpus', 'Timer', 'TimePredictor', 'set_tf32_mode', 'setup_determinacy', 'set_env',
-    'get_rank', 'get_local_rank', 'get_world_size', 'is_rank', 'is_master', 'master_only',
-    'NamedForwardHook', 'NamedBackwardHook', 'get_logger'
+    'set_visible_devices', 'set_tf32_mode', 'setup_determinacy', 'set_env', 'get_rank', 'get_local_rank', 'get_world_size', 'is_rank',
+    'is_master', 'master_only', 'get_logger', 'NamedForwardHook', 'NamedBackwardHook', 'Timer', 'TimePredictor'
 ]
diff --git a/easytorch/utils/dist.py b/easytorch/utils/dist.py
index 409450f..850c0c4 100644
--- a/easytorch/utils/dist.py
+++ b/easytorch/utils/dist.py
@@ -2,6 +2,7 @@
 
 import torch
 
+from ..device import get_device_count
 
 # default master rank
 MASTER_RANK = 0
@@ -30,7 +31,7 @@ def get_local_rank() -> int:
         local_rank (int)
     """
 
-    return get_rank() % torch.cuda.device_count() if torch.cuda.device_count() != 0 else 0
+    return get_rank() % get_device_count() if get_device_count() != 0 else 0
 
 
 def get_world_size() -> int:
diff --git a/easytorch/utils/env.py b/easytorch/utils/env.py
index 8bf18a2..4610e81 100644
--- a/easytorch/utils/env.py
+++ b/easytorch/utils/env.py
@@ -7,24 +7,28 @@
 
 from .logging import get_logger
 from .dist import get_rank
+from ..device import get_device_type, set_device_manual_seed
 
 
-def set_gpus(gpus: str):
+def set_visible_devices(devices: str):
     """Set environment variable `CUDA_VISIBLE_DEVICES` to select GPU devices.
 
     Examples:
-        set_gpus('0,1,2,3')
+        set_visible_devices('0,1,2,3')
 
     Args:
-        gpus (str): environment variable `CUDA_VISIBLE_DEVICES` value
+        devices (str): value for `CUDA_VISIBLE_DEVICES` (gpu) or `MLU_VISIBLE_DEVICES` (mlu)
     """
 
     logger = get_logger('easytorch-env')
-    if gpus is not None:
-        os.environ['CUDA_VISIBLE_DEVICES'] = gpus
-        logger.info('Use GPUs {}.'.format(gpus))
+    if devices is not None:
+        os.environ[{
+            'gpu': 'CUDA_VISIBLE_DEVICES',
+            'mlu': 'MLU_VISIBLE_DEVICES'
+        }[get_device_type()]] = devices
+        logger.info('Use devices {}.'.format(devices))
     else:
-        logger.info('Use all GPUs.')
+        logger.info('Use all devices.')
 
 
 def set_tf32_mode(tf32_mode: bool):
@@ -36,17 +40,21 @@ def set_tf32_mode(tf32_mode: bool):
     """
 
     logger = get_logger('easytorch-env')
-    if torch.__version__ >= '1.7.0':
-        if tf32_mode:
-            logger.info('Enable TF32 mode')
+    if get_device_type() == 'gpu':
+        if torch.__version__ >= '1.7.0':
+            if tf32_mode:
+                logger.info('Enable TF32 mode')
+            else:
+                # disable tf32 mode on Ampere gpu
+                torch.backends.cuda.matmul.allow_tf32 = False
+                torch.backends.cudnn.allow_tf32 = False
+                logger.info('Disable TF32 mode')
         else:
-            # disable tf32 mode on Ampere gpu
-            torch.backends.cuda.matmul.allow_tf32 = False
-            torch.backends.cudnn.allow_tf32 = False
-            logger.info('Disable TF32 mode')
+            if tf32_mode:
+                raise RuntimeError('Torch version {} does not support tf32'.format(torch.__version__))
     else:
         if tf32_mode:
-            raise RuntimeError('Torch version {} does not support tf32'.format(torch.__version__))
+            raise RuntimeError('Device {} does not support tf32.'.format(get_device_type()))
 
 
 def setup_determinacy(seed: int, deterministic: bool = False, cudnn_enabled: bool = True,
@@ -73,12 +81,12 @@ def setup_determinacy(seed: int, deterministic: bool = False, cudnn_enabled: boo
     random.seed(seed)
     np.random.seed(seed)
 
-    torch.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    torch.cuda.manual_seed_all(seed)
+    set_device_manual_seed(seed)
 
     if deterministic:
-        os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
+        if get_device_type() == 'gpu':
+            os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
+
         if torch.__version__ < '1.7.0':
             pass
         elif torch.__version__ < '1.8.0':
@@ -86,15 +94,17 @@ def setup_determinacy(seed: int, deterministic: bool = False, cudnn_enabled: boo
         else:
             torch.use_deterministic_algorithms(True)
         logger.info('Use deterministic algorithms.')
-    if not cudnn_enabled:
-        torch.backends.cudnn.enabled = False
-        logger.info('Unset cudnn enabled.')
-    if not cudnn_benchmark:
-        torch.backends.cudnn.benchmark = False
-        logger.info('Unset cudnn benchmark.')
-    if cudnn_deterministic:
-        torch.backends.cudnn.deterministic = True
-        logger.info('Set cudnn deterministic.')
+
+    if get_device_type() == 'gpu':
+        if not cudnn_enabled:
+            torch.backends.cudnn.enabled = False
+            logger.info('Unset cudnn enabled.')
+        if not cudnn_benchmark:
+            torch.backends.cudnn.benchmark = False
+            logger.info('Unset cudnn benchmark.')
+        if cudnn_deterministic:
+            torch.backends.cudnn.deterministic = True
+            logger.info('Set cudnn deterministic.')
 
 
 def set_env(env_cfg: Dict):
@@ -122,12 +132,11 @@ def set_env(env_cfg: Dict):
     # determinacy
     seed = env_cfg.get('SEED')
     if seed is not None:
-        cudnn = env_cfg.get('CUDNN', {})
         # each rank has different seed in distributed mode
         setup_determinacy(
             seed + get_rank(),
             env_cfg.get('DETERMINISTIC', False),
-            cudnn.get('ENABLED', True),
-            cudnn.get('BENCHMARK', True),
-            cudnn.get('DETERMINISTIC', False)
+            env_cfg.get('CUDNN.ENABLED', True),
+            env_cfg.get('CUDNN.BENCHMARK', True),
+            env_cfg.get('CUDNN.DETERMINISTIC', False)
         )
diff --git a/easytorch/version.py b/easytorch/version.py
index 4629418..97e862c 100644
--- a/easytorch/version.py
+++ b/easytorch/version.py
@@ -1,2 +1,2 @@
-__version__ = '1.2.12'
+__version__ = '1.3'
 __all__ = ['__version__']
diff --git a/examples/imagenet/configs/resnet50_8x_mlu_cfg.py b/examples/imagenet/configs/resnet50_8x_mlu_cfg.py
new file mode 100644
index 0000000..414a1f6
--- /dev/null
+++ b/examples/imagenet/configs/resnet50_8x_mlu_cfg.py
@@ -0,0 +1,66 @@
+import os
+from easytorch import Config
+
+from imagenet_runner import ImagenetRunner
+
+CFG = Config()
+
+CFG.DESC = 'imagenet resnet50'
+CFG.RUNNER = ImagenetRunner
+CFG.MLU_NUM = 8  # only MLU_NUM is non-zero here, so the launcher selects the 'mlu' device type
+CFG.DIST_BACKEND = 'cncl'  # handed to torch.distributed.init_process_group as `backend`
+
+CFG.MODEL = Config()
+CFG.MODEL.NAME = 'resnet50'
+
+CFG.TRAIN = Config()
+
+CFG.TRAIN.NUM_EPOCHS = 90
+CFG.TRAIN.CKPT_SAVE_DIR = os.path.join(
+    'checkpoints',
+    '_'.join([CFG.MODEL.NAME, str(CFG.TRAIN.NUM_EPOCHS)])
+)  # 'checkpoints' joined with '<MODEL.NAME>_<NUM_EPOCHS>'
+CFG.TRAIN.CKPT_SAVE_STRATEGY = None
+
+CFG.TRAIN.OPTIM = Config()
+CFG.TRAIN.OPTIM.TYPE = 'SGD'
+CFG.TRAIN.OPTIM.PARAM = {
+    'lr': 0.1,
+    'momentum': 0.9,
+    'weight_decay': 1e-4
+}
+
+CFG.TRAIN.LR_SCHEDULER = Config()
+CFG.TRAIN.LR_SCHEDULER.TYPE = 'StepLR'
+CFG.TRAIN.LR_SCHEDULER.PARAM = {
+    'step_size': 30,
+    'gamma': 0.1
+}
+
+IMAGENET_PATH = 'datasets/imagenet/jpegs'  # dataset root; the 'train' and 'val' sub-directories are joined below
+
+CFG.TRAIN.DATA = Config()
+CFG.TRAIN.DATA.BATCH_SIZE = 32
+CFG.TRAIN.DATA.NUM_WORKERS = 4
+CFG.TRAIN.DATA.SHUFFLE = True
+
+CFG.TRAIN.DATA.DIR = os.path.join(IMAGENET_PATH, 'train')
+CFG.TRAIN.DATA.CROP_SIZE = 224
+CFG.TRAIN.DATA.NORMALIZE = {
+    'mean': [0.485, 0.456, 0.406],
+    'std': [0.229, 0.224, 0.225]
+}
+
+CFG.VAL = Config()
+
+CFG.VAL.INTERVAL = 1
+
+CFG.VAL.DATA = Config()
+CFG.VAL.DATA.BATCH_SIZE = 32
+CFG.VAL.DATA.DIR = os.path.join(IMAGENET_PATH, 'val')
+CFG.VAL.DATA.CROP_SIZE = 224
+CFG.VAL.DATA.RESIZE = 256
+CFG.VAL.DATA.NORMALIZE = {  # same mean/std as CFG.TRAIN.DATA.NORMALIZE
+    'mean': [0.485, 0.456, 0.406],
+    'std': [0.229, 0.224, 0.225]
+}
diff --git a/examples/imagenet/imagenet_runner.py b/examples/imagenet/imagenet_runner.py
index 268eba0..c779260 100644
--- a/examples/imagenet/imagenet_runner.py
+++ b/examples/imagenet/imagenet_runner.py
@@ -6,6 +6,7 @@
 from torchvision import models, datasets, transforms
 
 from easytorch import Runner
+from easytorch.device import to_device
 
 
 def accuracy(output, target, topk=(1,)):
@@ -33,7 +34,7 @@ def __init__(self, cfg: Dict):
         super().__init__(cfg)
 
         self.criterion = nn.CrossEntropyLoss()
-        self.criterion = self.to_running_device(self.criterion)
+        self.criterion = to_device(self.criterion)
 
     def init_training(self, cfg: Dict):
         super().init_training(cfg)
@@ -80,8 +81,8 @@ def build_val_dataset(cfg: Dict):
     def train_iters(self, epoch: int, iter_index: int, data: Union[torch.Tensor, Tuple]) -> torch.Tensor:
         images, target = data
 
-        images = self.to_running_device(images)
-        target = self.to_running_device(target)
+        images = to_device(images)
+        target = to_device(target)
 
         output = self.model(images)
 
@@ -99,8 +100,8 @@ def train_iters(self, epoch: int, iter_index: int, data: Union[torch.Tensor, Tup
     def val_iters(self, iter_index: int, data: Union[torch.Tensor, Tuple]):
         images, target = data
 
-        images = self.to_running_device(images)
-        target = self.to_running_device(target)
+        images = to_device(images)
+        target = to_device(target)
 
         output = self.model(images)
 
diff --git a/examples/imagenet/validate.py b/examples/imagenet/validate.py
index 04e4ca5..3de0876 100644
--- a/examples/imagenet/validate.py
+++ b/examples/imagenet/validate.py
@@ -7,7 +7,8 @@ def parse_args():
     parser = ArgumentParser(description='Welcome to EasyTorch!')
     parser.add_argument('-c', '--cfg', help='training config', required=True)
     parser.add_argument('--ckpt', help='ckpt path. if it is None, load default ckpt in ckpt save dir', type=str)
-    parser.add_argument('--gpus', help='visible gpus', type=str)
+    parser.add_argument('--device-type', help='device type', type=str, default='gpu')
+    parser.add_argument('--devices', help='visible devices', type=str)
     return parser.parse_args()
 
 
@@ -22,4 +23,4 @@ def main(cfg: dict, runner: Runner, ckpt: str = None):
 
 if __name__ == '__main__':
     args = parse_args()
-    launch_runner(args.cfg, main, (args.ckpt, ), gpus=args.gpus)
+    launch_runner(args.cfg, main, (args.ckpt, ), device_type=args.device_type, devices=args.devices)
diff --git a/examples/linear_regression/linear_regression_runner.py b/examples/linear_regression/linear_regression_runner.py
index 395345f..e9381aa 100644
--- a/examples/linear_regression/linear_regression_runner.py
+++ b/examples/linear_regression/linear_regression_runner.py
@@ -1,6 +1,7 @@
 from torch import nn
 
 from easytorch import Runner
+from easytorch.device import to_device
 
 from dataset import LinearDataset
 
@@ -21,7 +22,7 @@ def init_training(self, cfg):
         super().init_training(cfg)
 
         self.loss = nn.MSELoss()
-        self.loss = self.to_running_device(self.loss)
+        self.loss = to_device(self.loss)
 
         self.register_epoch_meter('train_loss', 'train', '{:.2f}')
 
@@ -68,8 +69,8 @@ def train_iters(self, epoch, iter_index, data):
         """
 
         x, y = data
-        x = self.to_running_device(x)
-        y = self.to_running_device(y)
+        x = to_device(x)
+        y = to_device(y)
 
         output = self.model(x)
         loss = self.loss(output, y)
diff --git a/examples/mnist/mnist_runner.py b/examples/mnist/mnist_runner.py
index e12e525..e8e588e 100644
--- a/examples/mnist/mnist_runner.py
+++ b/examples/mnist/mnist_runner.py
@@ -5,6 +5,7 @@
 import torchvision
 
 from easytorch import Runner
+from easytorch.device import to_device
 
 from conv_net import ConvNet
 
@@ -25,7 +26,7 @@ def init_training(self, cfg: Dict):
         super().init_training(cfg)
 
         self.loss = nn.NLLLoss()
-        self.loss = self.to_running_device(self.loss)
+        self.loss = to_device(self.loss)
 
         self.register_epoch_meter('train_loss', 'train', '{:.2f}')
 
@@ -113,8 +114,8 @@ def train_iters(self, epoch: int, iter_index: int, data: Union[torch.Tensor, Tup
         """
 
         input_, target_ = data
-        input_ = self.to_running_device(input_)
-        target_ = self.to_running_device(target_)
+        input_ = to_device(input_)
+        target_ = to_device(target_)
 
         output = self.model(input_)
         loss = self.loss(output, target_)
@@ -130,8 +131,8 @@ def val_iters(self, iter_index: int, data: Union[torch.Tensor, Tuple]):
         """
 
         input_, target_ = data
-        input_ = self.to_running_device(input_)
-        target_ = self.to_running_device(target_)
+        input_ = to_device(input_)
+        target_ = to_device(target_)
 
         output = self.model(input_)
         pred = output.data.max(1, keepdim=True)[1]
diff --git a/examples/mnist/validate.py b/examples/mnist/validate.py
index 04e4ca5..3de0876 100644
--- a/examples/mnist/validate.py
+++ b/examples/mnist/validate.py
@@ -7,7 +7,8 @@ def parse_args():
     parser = ArgumentParser(description='Welcome to EasyTorch!')
     parser.add_argument('-c', '--cfg', help='training config', required=True)
     parser.add_argument('--ckpt', help='ckpt path. if it is None, load default ckpt in ckpt save dir', type=str)
-    parser.add_argument('--gpus', help='visible gpus', type=str)
+    parser.add_argument('--device-type', help='device type', type=str, default='gpu')
+    parser.add_argument('--devices', help='visible devices', type=str)
     return parser.parse_args()
 
 
@@ -22,4 +23,4 @@ def main(cfg: dict, runner: Runner, ckpt: str = None):
 
 if __name__ == '__main__':
     args = parse_args()
-    launch_runner(args.cfg, main, (args.ckpt, ), gpus=args.gpus)
+    launch_runner(args.cfg, main, (args.ckpt, ), device_type=args.device_type, devices=args.devices)
diff --git a/tests/random_test/random_test.py b/tests/random_test/random_test.py
index 25b4e87..32135a1 100644
--- a/tests/random_test/random_test.py
+++ b/tests/random_test/random_test.py
@@ -88,4 +88,4 @@ def build_cfg():
 if __name__ == '__main__':
     cfg_ = build_cfg()
 
-    launch_training(cfg_, gpus='0,1,2,3,4,5,6,7,8')
+    launch_training(cfg_, devices='0,1,2,3,4,5,6,7,8')