Skip to content

Commit

Permalink
support resnet50 training on mthreads
Browse files Browse the repository at this point in the history
  • Loading branch information
mingyuanw-mt committed Sep 11, 2023
1 parent 1fb63f6 commit b29f5a1
Show file tree
Hide file tree
Showing 13 changed files with 484 additions and 16 deletions.
73 changes: 58 additions & 15 deletions README.md

Large diffs are not rendered by default.

19 changes: 19 additions & 0 deletions training/benchmarks/driver/dist_pytorch.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,8 @@ def barrier(vendor="nvidia"):
if torch.distributed.is_available() and torch.distributed.is_initialized():
if vendor == "kunlunxin":
torch.distributed.barrier()
elif vendor == "mthreads":
torch.distributed.barrier()
else:
torch.distributed.all_reduce(torch.cuda.FloatTensor(1))
torch.cuda.synchronize()
Expand All @@ -172,6 +174,23 @@ def init_dist_training_env(config):
rank=rank,
world_size=world_size)
config.n_device = torch.distributed.get_world_size()
elif config.vendor == "mthreads":
import torch_musa
if int(os.environ.get("WORLD_SIZE", 1)) <= 1:
config.device = torch.device("musa")
config.n_device = 1
else:
torch.musa.set_device(config.local_rank)
host_addr_full = 'tcp://' + os.environ[
"MASTER_ADDR"] + ':' + os.environ["MASTER_PORT"]
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
torch.distributed.init_process_group(backend=config.dist_backend,
init_method=host_addr_full,
rank=rank,
world_size=world_size)
config.device = torch.device("musa", config.local_rank)
config.n_device = torch.distributed.get_world_size()
else: # nvidia
if int(os.environ.get("WORLD_SIZE", 1)) <= 1:
config.device = torch.device("cuda")
Expand Down
6 changes: 6 additions & 0 deletions training/benchmarks/driver/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,12 @@ def set_seed(self, seed: int, vendor: str = None):
elif lower_vendor == "ascend":
import mindspore
mindspore.set_seed(seed)
elif lower_vendor == "mthreads":
import torch
import torch_musa
torch.manual_seed(seed)
torch.musa.manual_seed(seed)
torch.musa.manual_seed_all(seed)
else:
# TODO 其他厂商设置seed,在此扩展
pass
70 changes: 70 additions & 0 deletions training/mthreads/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@

# 厂商信息

官网: https://www.mthreads.com/

摩尔线程智能科技(北京)有限责任公司(简称:摩尔线程)是一家以GPU芯片设计为主的集成电路设计企业,专注于研发设计全功能GPU芯片及相关产品,为科技生态合作伙伴提供强大的计算加速能力。公司致力于创新研发面向“元计算”应用的新一代GPU,构建融合视觉计算、3D图形计算、科学计算及人工智能计算的综合计算平台,建立基于云原生GPU计算的生态系统,助力驱动数字经济发展。

摩尔线程MTT S系列全功能GPU支持多样算力,借助覆盖深度学习、图形渲染、视频处理和科学计算的完整MUSA软件栈,可为AI训练、AI推理、大模型、AIGC、云游戏、云渲染、视频云、数字孪生等场景提供通用智能算力支持,旨在为数据中心、智算中心和元计算中心的建设构建坚实算力基础,助力元宇宙中多元应用创新和落地。

MUSA软件栈通过musify CUDA代码迁移工具、计算/通信加速库、mcc编译器、musa运行时和驱动实现对CUDA生态的兼容,帮助用户快速完成代码及应用的迁移。通过torch_musa插件,可以实现MTT S系列GPU对原生PyTorch的对接,用户可以无感地把AI模型运行在摩尔线程全功能GPU上。

# FlagPerf适配验证环境说明
## 环境配置参考
- 硬件
- 机器型号: MCCX D800
- 加速卡型号: MTT S3000 32GB
- CPU型号:Intel(R) Xeon(R) Gold 6330 CPU @ 2.00GHz
- 多机网络类型、带宽: InfiniBand,2*200Gbps
- 软件
- OS版本:Ubuntu 20.04 LTS
- OS kernel版本: 5.4.0-154-generic
- 加速卡驱动版本:2.2.0
- Docker 版本: PyTroch2.0_musa1.4_ec6a747fd342

## 容器镜像信息
- 容器构建信息
- Dockerfile路径:training/mthreads/docker_image/pytorch_2.0/Dockerfile
- 构建后软件安装脚本: training/mthreads/docker_image/pytorch_2.0/pytorch_2.0_install.sh

- 核心软件信息

- AI框架&版本
- PyTorch: v2.0.0

- 其它软件版本
- torch_musa: 2.0.0+git8ea3501
- musa toolkits: 1.4.0+git4e25703
- mcc: 1.4.0+git5a5bcc07
- mublas: 1.1.0+gite484aa2


## 加速卡监控采集
- 加速卡使用信息采集命令

```bash
mthreads-gmi -q | grep -E 'GPU Current Temp|Power Draw|Used|Total|Gpu' | \
awk -F ': *' '/GPU Current Temp|Power Draw|Used|Total|Gpu/ \
{ values[(NR-1)%5+1] = $2; } NR % 5 == 0 { print values[4], values[5], values[2], values[1], values[3]; }'
```
- 监控项示例:
```bash
45C 109.51W 1MiB 32768MiB 0%
44C 108.95W 1MiB 32768MiB 0%
46C 110.87W 1MiB 32768MiB 0%
43C 104.33W 1MiB 32768MiB 0%
44C 107.55W 8MiB 32768MiB 0%
46C 110.51W 8MiB 32768MiB 0%
44C 106.59W 8MiB 32768MiB 0%
44C 104.58W 8MiB 32768MiB 0%
```
- 加速卡使用信息采集项说明

|监控项| 日志文件 | 格式 |
|---|---|---|
|温度| mthreads_monitor.log | xxx C |
|功耗 |mthreads_monitor.log | xxx W |
|显存占用大小 |mthreads_monitor.log |xxx MiB |
|总显存大小 |mthreads_monitor.log |xxx MiB |
|显存使用率 |mthreads_monitor.log |xxx % |

4 changes: 4 additions & 0 deletions training/mthreads/docker_image/pytorch_2.0/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# TODO modify Dockerfile when ready to publish
FROM sh-harbor.mthreads.com/mt-ai/flagperf-musa-pytorch:test_git8ea3501_env0.16

ENV PATH /opt/conda/envs/py38/bin:$PATH
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
#!/bin/bash
258 changes: 258 additions & 0 deletions training/mthreads/mthreads_monitor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,258 @@
# !/usr/bin/env python3
# encoding: utf-8
'''
Usage: python3 mthreads_monitor.py -o operation -l [log_path]
    -o, --operation start|stop|restart|status
    -l, --log log path, ./logs/ by default
'''

import os
import sys
import time
import signal
import atexit
import argparse
import datetime
from multiprocessing import Process
import subprocess
import schedule


class Daemon:
    '''
    Double-fork daemon that periodically samples Moore Threads GPU metrics
    via ``mthreads-gmi`` and appends them to a log file.

    usage: subclass this daemon and override the run() method.
    sys-monitor.pid: in the /tmp/, auto del when unexpected exit.
    verbose: debug mode, disabled default.
    '''

    def __init__(self,
                 pid_file,
                 log_file,
                 err_file,
                 gpu_log,
                 log_path,
                 rate=5,
                 stdin=os.devnull,
                 stdout=os.devnull,
                 stderr=os.devnull,
                 home_dir='.',
                 umask=0o22,
                 verbose=0):
        # Streams the daemonized process redirects its stdio to.
        self.stdin = stdin
        self.stdout = stdout
        self.stderr = stderr
        self.home_dir = home_dir
        self.pidfile = pid_file  # pid file path; doubles as "already running" lock
        self.logfile = log_file
        self.errfile = err_file
        self.gpufile = gpu_log  # destination file for sampled GPU metrics
        self.logpath = log_path
        self.rate = rate  # sampling interval in seconds
        self.umask = umask
        self.verbose = verbose  # >= 1 enables progress prints
        self.daemon_alive = True

    def get_pid(self):
        """Return the pid recorded in the pid file, or None if it is
        missing, unreadable, or does not contain an integer."""
        try:
            with open(self.pidfile, 'r') as pf:
                pid = int(pf.read().strip())
        except (IOError, ValueError):
            # Missing file or corrupt contents both mean "not running".
            pid = None
        return pid

    def del_pid(self):
        """Remove the pid file if present (registered via atexit)."""
        if os.path.exists(self.pidfile):
            os.remove(self.pidfile)

    def run(self):
        '''
        Sample GPU metrics every ``self.rate`` seconds, forever.
        NOTE: override the method in subclass for a different payload.
        '''

        def gpu_mon(file):
            """Run one mthreads-gmi sample and append the result to *file*."""
            timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
            # The awk stage buffers the five raw fields per GPU
            # (temp, power, used, total, util) and reorders them to
            # "total util power temp used" per output line.
            # TODO more elegant way?
            cmd = "mthreads-gmi -q | grep -E 'GPU Current Temp|Power Draw|Used|Total|Gpu' | "
            cmd += "awk -F ': *' '/GPU Current Temp|Power Draw|Used|Total|Gpu/ { values[(NR-1)%5+1] = $2; } NR % 5 == 0 { print values[4], values[5], values[2], values[1], values[3]; }'"
            process = subprocess.Popen(cmd,
                                       shell=True,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.STDOUT,
                                       encoding='utf-8')
            try:
                out = process.communicate(timeout=10)
            except subprocess.TimeoutExpired:
                process.kill()
                out = process.communicate()

            # BUGFIX: the original assigned "error" and then unconditionally
            # overwrote it, so failures were logged as garbage output.
            if process.returncode != 0:
                result = timestamp + "\n" + "error" + "\n"
            else:
                result = timestamp + "\n" + out[0] + "\n"
            with open(file, 'a') as f:
                f.write(result)

        def timer_gpu_mon():
            # Sample in a child process so a hung mthreads-gmi cannot
            # stall the scheduler loop.
            gpu_process = Process(target=gpu_mon, args=(self.gpufile, ))
            gpu_process.start()

        schedule.every(self.rate).seconds.do(timer_gpu_mon)
        while True:
            schedule.run_pending()
            time.sleep(5)

    def daemonize(self):
        """Detach from the controlling terminal via the classic double
        fork, redirect stdio, and record our pid in the pid file."""
        if self.verbose >= 1:
            print('daemon process starting ...')
        # First fork: parent exits, child continues in the background.
        try:
            pid = os.fork()
            if pid > 0:
                sys.exit(0)
        except OSError as e:
            sys.stderr.write('fork #1 failed: %d (%s)\n' %
                             (e.errno, e.strerror))
            sys.exit(1)
        os.chdir(self.home_dir)
        os.setsid()  # become session leader, drop the controlling tty
        os.umask(self.umask)
        # Second fork: ensure we can never re-acquire a controlling tty.
        try:
            pid = os.fork()
            if pid > 0:
                sys.exit(0)
        except OSError as e:
            sys.stderr.write('fork #2 failed: %d (%s)\n' %
                             (e.errno, e.strerror))
            sys.exit(1)
        sys.stdout.flush()
        sys.stderr.flush()
        # Redirect the standard file descriptors.
        si = open(self.stdin, 'r')
        so = open(self.stdout, 'a+')
        if self.stderr:
            se = open(self.stderr, 'a+')
        else:
            se = so
        os.dup2(si.fileno(), sys.stdin.fileno())
        os.dup2(so.fileno(), sys.stdout.fileno())
        os.dup2(se.fileno(), sys.stderr.fileno())
        atexit.register(self.del_pid)
        pid = str(os.getpid())
        with open(self.pidfile, 'w+') as f:
            f.write('%s\n' % pid)

    def start(self):
        """Start the daemon unless the pid file says one already runs."""
        if not os.path.exists(self.logpath):
            os.makedirs(self.logpath)
        elif os.path.exists(self.gpufile):
            # Fresh run: discard metrics from a previous session.
            os.remove(self.gpufile)
        if self.verbose >= 1:
            print('ready to start ......')
        # check for a pid file to see if the daemon already runs
        pid = self.get_pid()
        if pid:
            msg = 'pid file %s already exists, is it already running?\n'
            sys.stderr.write(msg % self.pidfile)
            sys.exit(1)
        # start the daemon
        self.daemonize()
        self.run()

    def stop(self):
        """Stop the daemon: SIGTERM the recorded pid, escalating to
        SIGHUP every 10 attempts, then clean up the pid file."""
        if self.verbose >= 1:
            print('stopping ...')
        pid = self.get_pid()
        if not pid:
            msg = 'pid file [%s] does not exist. Not running?\n' % self.pidfile
            sys.stderr.write(msg)
            if os.path.exists(self.pidfile):
                os.remove(self.pidfile)  # stale or corrupt pid file
            return
        # try to kill the daemon process
        try:
            i = 0
            while 1:
                os.kill(pid, signal.SIGTERM)
                time.sleep(1)
                i = i + 1
                if i % 10 == 0:
                    os.kill(pid, signal.SIGHUP)
        except ProcessLookupError:
            # Process is gone (replaces the fragile substring match on
            # the OSError message) -- remove the now-stale pid file.
            if os.path.exists(self.pidfile):
                os.remove(self.pidfile)
        except OSError as err:
            print(str(err))
            sys.exit(1)
        if self.verbose >= 1:
            print('Stopped!')

    def restart(self):
        """Stop (if running) and start again."""
        self.stop()
        self.start()

    def status(self):
        """Return the live daemon's pid, or False when not running."""
        pid = self.get_pid()
        if pid:
            if os.path.exists('/proc/%d' % pid):
                return pid
        return False


def parse_args():
    '''Parse the command-line options of the monitor script and return
    the resulting namespace (``.o`` = operation, ``.l`` = log path).'''
    parser = argparse.ArgumentParser(description='Sys monitor script')
    parser.add_argument('-o',
                        type=str,
                        metavar='[operation]',
                        required=True,
                        help='start|stop|restart|status')
    parser.add_argument('-l',
                        type=str,
                        metavar='[log_path]',
                        required=False,
                        default='./logs/',
                        help='log path')
    return parser.parse_args()


def main():
    '''Entry point: build the monitor daemon from the command-line
    options and dispatch the requested operation.'''
    sample_interval = 5  # seconds between GPU samples
    args = parse_args()
    operation = args.o
    log_path = args.l
    pid_fn = '/tmp/gpu_monitor.pid'
    log_fn = log_path + '/mthreads_monitor.log'
    err_fn = log_path + '/mthreads_monitor.err'
    # result for gpu
    gpu_fn = log_path + '/mthreads_monitor.log'

    subdaemon = Daemon(pid_fn,
                       log_fn,
                       err_fn,
                       gpu_fn,
                       log_path,
                       verbose=1,
                       rate=sample_interval)
    if operation == 'status':
        pid = subdaemon.status()
        if pid:
            print('process [%s] is running ......' % pid)
        else:
            print('daemon process [%s] stopped' % pid)
    elif operation in ('start', 'stop', 'restart'):
        # Operation names map 1:1 onto Daemon methods.
        getattr(subdaemon, operation)()
    else:
        print("invalid argument!")
        sys.exit(1)


# Run as a CLI tool: parse arguments and start/stop/restart/query the daemon.
if __name__ == '__main__':
    main()
Loading

0 comments on commit b29f5a1

Please sign in to comment.