-
Notifications
You must be signed in to change notification settings - Fork 109
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
support resnet50 training on mthreads
- Loading branch information
1 parent
1fb63f6
commit b29f5a1
Showing
13 changed files
with
484 additions
and
16 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
|
||
# 厂商信息 | ||
|
||
官网: https://www.mthreads.com/ | ||
|
||
摩尔线程智能科技(北京)有限责任公司(简称:摩尔线程)是一家以GPU芯片设计为主的集成电路设计企业,专注于研发设计全功能GPU芯片及相关产品,为科技生态合作伙伴提供强大的计算加速能力。公司致力于创新研发面向“元计算”应用的新一代GPU,构建融合视觉计算、3D图形计算、科学计算及人工智能计算的综合计算平台,建立基于云原生GPU计算的生态系统,助力驱动数字经济发展。 | ||
|
||
摩尔线程MTT S系列全功能GPU支持多样算力,借助覆盖深度学习、图形渲染、视频处理和科学计算的完整MUSA软件栈,可为AI训练、AI推理、大模型、AIGC、云游戏、云渲染、视频云、数字孪生等场景提供通用智能算力支持,旨在为数据中心、智算中心和元计算中心的建设构建坚实算力基础,助力元宇宙中多元应用创新和落地。 | ||
|
||
MUSA软件栈通过musify CUDA代码迁移工具、计算/通信加速库、mcc编译器、musa运行时和驱动实现对CUDA生态的兼容,帮助用户快速完成代码及应用的迁移。通过torch_musa插件,可以实现MTT S系列GPU对原生PyTorch的对接,用户可以无感地把AI模型运行在摩尔线程全功能GPU上。 | ||
|
||
# FlagPerf适配验证环境说明 | ||
## 环境配置参考 | ||
- 硬件 | ||
- 机器型号: MCCX D800 | ||
- 加速卡型号: MTT S3000 32GB | ||
- CPU型号:Intel(R) Xeon(R) Gold 6330 CPU @ 2.00GHz | ||
- 多机网络类型、带宽: InfiniBand,2*200Gbps | ||
- 软件 | ||
- OS版本:Ubuntu 20.04 LTS | ||
- OS kernel版本: 5.4.0-154-generic | ||
- 加速卡驱动版本:2.2.0 | ||
- Docker 版本: PyTroch2.0_musa1.4_ec6a747fd342 | ||
|
||
## 容器镜像信息 | ||
- 容器构建信息 | ||
- Dockerfile路径:training/mthreads/docker_image/pytorch_2.0/Dockerfile | ||
- 构建后软件安装脚本: training/mthreads/docker_image/pytorch_2.0/pytorch_2.0_install.sh | ||
|
||
- 核心软件信息 | ||
|
||
- AI框架&版本 | ||
- PyTorch: v2.0.0 | ||
|
||
- 其它软件版本 | ||
- torch_musa: 2.0.0+git8ea3501 | ||
- musa toolkits: 1.4.0+git4e25703 | ||
- mcc: 1.4.0+git5a5bcc07 | ||
- mublas: 1.1.0+gite484aa2 | ||
|
||
|
||
## 加速卡监控采集 | ||
- 加速卡使用信息采集命令 | ||
|
||
```bash | ||
mthreads-gmi -q | grep -E 'GPU Current Temp|Power Draw|Used|Total|Gpu' | \ | ||
awk -F ': *' '/GPU Current Temp|Power Draw|Used|Total|Gpu/ \ | ||
{ values[(NR-1)%5+1] = $2; } NR % 5 == 0 { print values[4], values[5], values[2], values[1], values[3]; }' | ||
``` | ||
- 监控项示例: | ||
```bash | ||
45C 109.51W 1MiB 32768MiB 0% | ||
44C 108.95W 1MiB 32768MiB 0% | ||
46C 110.87W 1MiB 32768MiB 0% | ||
43C 104.33W 1MiB 32768MiB 0% | ||
44C 107.55W 8MiB 32768MiB 0% | ||
46C 110.51W 8MiB 32768MiB 0% | ||
44C 106.59W 8MiB 32768MiB 0% | ||
44C 104.58W 8MiB 32768MiB 0% | ||
``` | ||
- 加速卡使用信息采集项说明 | ||
|
||
|监控项| 日志文件 | 格式 | | ||
|---|---|---| | ||
|温度| mthreads_monitor.log | xxx C | | ||
|功耗 |mthreads_monitor.log | xxx W | | ||
|显存占用大小 |mthreads_monitor.log |xxx MiB | | ||
|总显存大小 |mthreads_monitor.log |xxx MiB | | ||
|显存使用率 |mthreads_monitor.log |xxx % | | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# TODO modify Dockerfile when ready to publish | ||
FROM sh-harbor.mthreads.com/mt-ai/flagperf-musa-pytorch:test_git8ea3501_env0.16 | ||
|
||
ENV PATH /opt/conda/envs/py38/bin:$PATH |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
#!/bin/bash |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,258 @@ | ||
# !/usr/bin/env python3 | ||
# encoding: utf-8 | ||
''' | ||
Usage: python3 sys-monitor.py -o operation -l [log_path] | ||
-o, --operation start|stop|restart|status | ||
-l, --log log path , ./logs/ default | ||
''' | ||
|
||
import os | ||
import sys | ||
import time | ||
import signal | ||
import atexit | ||
import argparse | ||
import datetime | ||
from multiprocessing import Process | ||
import subprocess | ||
import schedule | ||
|
||
|
||
class Daemon:
    '''
    Double-fork daemon that periodically samples accelerator status.

    Usage: instantiate and call start()/stop()/restart()/status(); a
    subclass may override run() to change what the daemon does.
    pid_file: file holding the daemon pid; removed by stop() and by the
              atexit hook registered in daemonize().
    gpu_log:  file the sampled `mthreads-gmi` output is appended to.
    log_path: directory that is created by start() when missing.
    rate:     sampling period in seconds.
    verbose:  debug mode, disabled by default.
    '''

    def __init__(self,
                 pid_file,
                 log_file,
                 err_file,
                 gpu_log,
                 log_path,
                 rate=5,
                 stdin=os.devnull,
                 stdout=os.devnull,
                 stderr=os.devnull,
                 home_dir='.',
                 umask=0o22,
                 verbose=0):
        self.stdin = stdin
        self.stdout = stdout
        self.stderr = stderr
        self.home_dir = home_dir
        self.verbose = verbose
        self.pidfile = pid_file
        self.logfile = log_file
        self.errfile = err_file
        self.gpufile = gpu_log
        self.logpath = log_path
        self.rate = rate
        self.umask = umask
        self.daemon_alive = True

    def get_pid(self):
        '''
        Return the pid stored in the pid file, or None when the file is
        missing, unreadable, or does not contain an integer.
        '''
        try:
            with open(self.pidfile, 'r') as pf:
                return int(pf.read().strip())
        except (IOError, ValueError):
            # ValueError: empty or corrupt pid file; treat it as "no pid"
            # instead of crashing every start/stop/status call.
            return None

    def del_pid(self):
        ''' Remove the pid file if it exists. '''
        if os.path.exists(self.pidfile):
            os.remove(self.pidfile)

    def run(self):
        '''
        Sample the accelerators every `self.rate` seconds, forever.

        NOTE: override the method in subclass to change the workload.
        '''

        def gpu_mon(file):
            ''' Run one mthreads-gmi query and append its output to file. '''
            timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
            # TODO more elegant way?
            cmd = "mthreads-gmi -q | grep -E 'GPU Current Temp|Power Draw|Used|Total|Gpu' | "
            cmd += "awk -F ': *' '/GPU Current Temp|Power Draw|Used|Total|Gpu/ { values[(NR-1)%5+1] = $2; } NR % 5 == 0 { print values[4], values[5], values[2], values[1], values[3]; }'"
            process = subprocess.Popen(cmd,
                                       shell=True,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.STDOUT,
                                       encoding='utf-8')
            try:
                out = process.communicate(timeout=10)
            except subprocess.TimeoutExpired:
                process.kill()
                out = process.communicate()

            # stderr is merged into stdout above, so a failing command
            # leaves its error text in the log under the same timestamp.
            result = timestamp + "\n" + out[0] + "\n"
            with open(file, 'a') as f:
                f.write(result)

        def timer_gpu_mon():
            ''' Launch one sample in a separate process. '''
            gpu_process = Process(target=gpu_mon, args=(self.gpufile, ))
            gpu_process.start()

        schedule.every(self.rate).seconds.do(timer_gpu_mon)
        # Poll at least as often as the sampling rate so that rates below
        # 5 seconds are honoured; never poll faster than once per second.
        poll_interval = max(1, min(self.rate, 5))
        while True:
            schedule.run_pending()
            time.sleep(poll_interval)

    def daemonize(self):
        '''
        Detach from the controlling terminal with two forks, redirect the
        standard streams and write the daemon pid to the pid file.
        '''
        if self.verbose >= 1:
            print('daemon process starting ...')
        try:
            pid = os.fork()
            if pid > 0:
                # first parent exits
                sys.exit(0)
        except OSError as e:
            sys.stderr.write('fork #1 failed: %d (%s)\n' %
                             (e.errno, e.strerror))
            sys.exit(1)
        os.chdir(self.home_dir)
        os.setsid()
        os.umask(self.umask)
        try:
            pid = os.fork()
            if pid > 0:
                # second parent exits
                sys.exit(0)
        except OSError as e:
            sys.stderr.write('fork #2 failed: %d (%s)\n' %
                             (e.errno, e.strerror))
            sys.exit(1)
        sys.stdout.flush()
        sys.stderr.flush()
        si = open(self.stdin, 'r')
        so = open(self.stdout, 'a+')
        if self.stderr:
            se = open(self.stderr, 'a+')
        else:
            se = so
        os.dup2(si.fileno(), sys.stdin.fileno())
        os.dup2(so.fileno(), sys.stdout.fileno())
        os.dup2(se.fileno(), sys.stderr.fileno())
        atexit.register(self.del_pid)
        pid = str(os.getpid())
        with open(self.pidfile, 'w+') as f:
            f.write('%s\n' % pid)

    def start(self):
        '''
        Start the daemon; exits with status 1 when a pid file with a pid
        already exists.
        '''
        if self.verbose >= 1:
            print('ready to start ......')
        # check for a pid file to see if the daemon already runs; do this
        # BEFORE touching the log so a refused start cannot delete the log
        # of the daemon that is currently running
        pid = self.get_pid()
        if pid:
            msg = 'pid file %s already exists, is it already running?\n'
            sys.stderr.write(msg % self.pidfile)
            sys.exit(1)
        if not os.path.exists(self.logpath):
            os.makedirs(self.logpath)
        elif os.path.exists(self.gpufile):
            # start each run with a fresh sample log
            os.remove(self.gpufile)
        # start the daemon
        self.daemonize()
        self.run()

    def stop(self):
        '''
        Signal the daemon until it is gone, then remove the pid file.
        Exits with status 1 when signalling fails for any reason other
        than the process no longer existing.
        '''
        if self.verbose >= 1:
            print('stopping ...')
        pid = self.get_pid()
        if not pid:
            msg = 'pid file [%s] does not exist. Not running?\n' % self.pidfile
            sys.stderr.write(msg)
            self.del_pid()
            return
        # try to kill the daemon process
        try:
            attempts = 0
            while True:
                os.kill(pid, signal.SIGTERM)
                time.sleep(1)
                attempts += 1
                if attempts % 10 == 0:
                    os.kill(pid, signal.SIGHUP)
        except ProcessLookupError:
            # the process is gone (ESRCH): this is the success path
            self.del_pid()
        except OSError as err:
            print(str(err))
            sys.exit(1)
        if self.verbose >= 1:
            print('Stopped!')

    def restart(self):
        ''' Stop the daemon (if any) and start it again. '''
        self.stop()
        self.start()

    def status(self):
        '''
        Return the daemon pid when the pid file names a process that has
        a /proc entry, otherwise False.
        '''
        pid = self.get_pid()
        if pid:
            if os.path.exists('/proc/%d' % pid):
                return pid
        return False
|
||
|
||
def parse_args():
    '''
    Parse the command line of the monitor script.

    Returns the argparse namespace with two attributes:
      o: requested operation (start|stop|restart|status), required.
      l: log directory, './logs/' when not given.
    The long spellings --operation and --log promised by the module
    usage text are accepted as aliases of -o and -l.
    '''
    parse = argparse.ArgumentParser(description='Sys monitor script')
    parse.add_argument('-o',
                       '--operation',
                       dest='o',  # keep the attribute name callers read
                       type=str,
                       metavar='[operation]',
                       required=True,
                       help='start|stop|restart|status')
    parse.add_argument('-l',
                       '--log',
                       dest='l',  # keep the attribute name callers read
                       type=str,
                       metavar='[log_path]',
                       required=False,
                       default='./logs/',
                       help='log path')
    args = parse.parse_args()
    return args
|
||
|
||
def main():
    '''
    Entry point: build the monitor daemon from the command line and
    perform the requested operation on it.

    Prints "invalid argument!" and exits with status 1 for an operation
    other than start, stop, restart or status.
    '''
    cli = parse_args()
    action = cli.o
    log_dir = cli.l
    interval = 5

    pid_path = '/tmp/gpu_monitor.pid'
    out_path = log_dir + '/mthreads_monitor.log'
    err_path = log_dir + '/mthreads_monitor.err'
    # result for gpu (same file as the general log)
    gpu_path = log_dir + '/mthreads_monitor.log'

    monitor = Daemon(pid_path,
                     out_path,
                     err_path,
                     gpu_path,
                     log_dir,
                     verbose=1,
                     rate=interval)

    if action == 'status':
        running_pid = monitor.status()
        if running_pid:
            print('process [%s] is running ......' % running_pid)
        else:
            print('daemon process [%s] stopped' % running_pid)
        return

    handlers = {
        'start': monitor.start,
        'stop': monitor.stop,
        'restart': monitor.restart,
    }
    handler = handlers.get(action)
    if handler is None:
        print("invalid argument!")
        sys.exit(1)
    handler()
|
||
|
||
# Run the command-line interface only when executed as a script.
if "__main__" == __name__:
    main()
Oops, something went wrong.