Skip to content

Commit

Permalink
support resnet50 training on mthreads
Browse files Browse the repository at this point in the history
  • Loading branch information
mingyuanw-mt committed Sep 11, 2023
1 parent 1fb63f6 commit b29f5a1
Show file tree
Hide file tree
Showing 13 changed files with 484 additions and 16 deletions.
73 changes: 58 additions & 15 deletions README.md

Large diffs are not rendered by default.

19 changes: 19 additions & 0 deletions training/benchmarks/driver/dist_pytorch.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,8 @@ def barrier(vendor="nvidia"):
if torch.distributed.is_available() and torch.distributed.is_initialized():
if vendor == "kunlunxin":
torch.distributed.barrier()
elif vendor == "mthreads":
torch.distributed.barrier()
else:
torch.distributed.all_reduce(torch.cuda.FloatTensor(1))
torch.cuda.synchronize()
Expand All @@ -172,6 +174,23 @@ def init_dist_training_env(config):
rank=rank,
world_size=world_size)
config.n_device = torch.distributed.get_world_size()
elif config.vendor == "mthreads":
import torch_musa
if int(os.environ.get("WORLD_SIZE", 1)) <= 1:
config.device = torch.device("musa")
config.n_device = 1
else:
torch.musa.set_device(config.local_rank)
host_addr_full = 'tcp://' + os.environ[
"MASTER_ADDR"] + ':' + os.environ["MASTER_PORT"]
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
torch.distributed.init_process_group(backend=config.dist_backend,
init_method=host_addr_full,
rank=rank,
world_size=world_size)
config.device = torch.device("musa", config.local_rank)
config.n_device = torch.distributed.get_world_size()
else: # nvidia
if int(os.environ.get("WORLD_SIZE", 1)) <= 1:
config.device = torch.device("cuda")
Expand Down
6 changes: 6 additions & 0 deletions training/benchmarks/driver/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,12 @@ def set_seed(self, seed: int, vendor: str = None):
elif lower_vendor == "ascend":
import mindspore
mindspore.set_seed(seed)
elif lower_vendor == "mthreads":
import torch
import torch_musa
torch.manual_seed(seed)
torch.musa.manual_seed(seed)
torch.musa.manual_seed_all(seed)
else:
# TODO 其他厂商设置seed,在此扩展
pass
70 changes: 70 additions & 0 deletions training/mthreads/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@

# 厂商信息

官网: https://www.mthreads.com/

摩尔线程智能科技(北京)有限责任公司(简称:摩尔线程)是一家以GPU芯片设计为主的集成电路设计企业,专注于研发设计全功能GPU芯片及相关产品,为科技生态合作伙伴提供强大的计算加速能力。公司致力于创新研发面向“元计算”应用的新一代GPU,构建融合视觉计算、3D图形计算、科学计算及人工智能计算的综合计算平台,建立基于云原生GPU计算的生态系统,助力驱动数字经济发展。

摩尔线程MTT S系列全功能GPU支持多样算力,借助覆盖深度学习、图形渲染、视频处理和科学计算的完整MUSA软件栈,可为AI训练、AI推理、大模型、AIGC、云游戏、云渲染、视频云、数字孪生等场景提供通用智能算力支持,旨在为数据中心、智算中心和元计算中心的建设构建坚实算力基础,助力元宇宙中多元应用创新和落地。

MUSA软件栈通过musify CUDA代码迁移工具、计算/通信加速库、mcc编译器、musa运行时和驱动实现对CUDA生态的兼容,帮助用户快速完成代码及应用的迁移。通过torch_musa插件,可以实现MTT S系列GPU对原生PyTorch的对接,用户可以无感地把AI模型运行在摩尔线程全功能GPU上。

# FlagPerf适配验证环境说明
## 环境配置参考
- 硬件
- 机器型号: MCCX D800
- 加速卡型号: MTT S3000 32GB
- CPU型号:Intel(R) Xeon(R) Gold 6330 CPU @ 2.00GHz
- 多机网络类型、带宽: InfiniBand,2*200Gbps
- 软件
- OS版本:Ubuntu 20.04 LTS
- OS kernel版本: 5.4.0-154-generic
- 加速卡驱动版本:2.2.0
- Docker 版本: PyTroch2.0_musa1.4_ec6a747fd342

## 容器镜像信息
- 容器构建信息
- Dockerfile路径:training/mthreads/docker_image/pytorch_2.0/Dockerfile
- 构建后软件安装脚本: training/mthreads/docker_image/pytorch_2.0/pytorch_2.0_install.sh

- 核心软件信息

- AI框架&版本
- PyTorch: v2.0.0

- 其它软件版本
- torch_musa: 2.0.0+git8ea3501
- musa toolkits: 1.4.0+git4e25703
- mcc: 1.4.0+git5a5bcc07
- mublas: 1.1.0+gite484aa2


## 加速卡监控采集
- 加速卡使用信息采集命令

```bash
mthreads-gmi -q | grep -E 'GPU Current Temp|Power Draw|Used|Total|Gpu' | \
awk -F ': *' '/GPU Current Temp|Power Draw|Used|Total|Gpu/ \
{ values[(NR-1)%5+1] = $2; } NR % 5 == 0 { print values[4], values[5], values[2], values[1], values[3]; }'
```
- 监控项示例:
```bash
45C 109.51W 1MiB 32768MiB 0%
44C 108.95W 1MiB 32768MiB 0%
46C 110.87W 1MiB 32768MiB 0%
43C 104.33W 1MiB 32768MiB 0%
44C 107.55W 8MiB 32768MiB 0%
46C 110.51W 8MiB 32768MiB 0%
44C 106.59W 8MiB 32768MiB 0%
44C 104.58W 8MiB 32768MiB 0%
```
- 加速卡使用信息采集项说明

|监控项| 日志文件 | 格式 |
|---|---|---|
|温度| mthreads_monitor.log | xxx C |
|功耗 |mthreads_monitor.log | xxx W |
|显存占用大小 |mthreads_monitor.log |xxx MiB |
|总显存大小 |mthreads_monitor.log |xxx MiB |
|显存使用率 |mthreads_monitor.log |xxx % |

4 changes: 4 additions & 0 deletions training/mthreads/docker_image/pytorch_2.0/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# TODO modify Dockerfile when ready to publish
FROM sh-harbor.mthreads.com/mt-ai/flagperf-musa-pytorch:test_git8ea3501_env0.16

ENV PATH /opt/conda/envs/py38/bin:$PATH
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
#!/bin/bash
258 changes: 258 additions & 0 deletions training/mthreads/mthreads_monitor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,258 @@
# !/usr/bin/env python3
# encoding: utf-8
'''
Usage: python3 mthreads_monitor.py -o operation -l [log_path]
    -o, --operation start|stop|restart|status
    -l, --log log path, ./logs/ by default
'''

import os
import sys
import time
import signal
import atexit
import argparse
import datetime
from multiprocessing import Process
import subprocess
import schedule


class Daemon:
    '''
    Double-fork daemon that periodically samples Moore Threads GPU metrics
    via ``mthreads-gmi`` and appends them to a log file.

    usage: subclass this daemon and override the run() method.
    sys-monitor.pid: in the /tmp/, auto del when unexpected exit.
    verbose: debug mode, disabled default.
    '''

    def __init__(self,
                 pid_file,
                 log_file,
                 err_file,
                 gpu_log,
                 log_path,
                 rate=5,
                 stdin=os.devnull,
                 stdout=os.devnull,
                 stderr=os.devnull,
                 home_dir='.',
                 umask=0o22,
                 verbose=0):
        # Streams the daemonized process redirects its stdio to.
        self.stdin = stdin
        self.stdout = stdout
        self.stderr = stderr
        self.home_dir = home_dir
        self.pidfile = pid_file  # pid file path; doubles as "already running" lock
        self.logfile = log_file
        self.errfile = err_file
        self.gpufile = gpu_log  # destination file for sampled GPU metrics
        self.logpath = log_path
        self.rate = rate  # sampling interval in seconds
        self.umask = umask
        self.verbose = verbose  # >= 1 enables progress prints
        self.daemon_alive = True

    def get_pid(self):
        """Return the pid recorded in the pid file, or None if it is
        missing, unreadable, or does not contain an integer."""
        try:
            with open(self.pidfile, 'r') as pf:
                pid = int(pf.read().strip())
        except (IOError, ValueError):
            # Missing file or corrupt contents both mean "not running".
            pid = None
        return pid

    def del_pid(self):
        """Remove the pid file if present (registered via atexit)."""
        if os.path.exists(self.pidfile):
            os.remove(self.pidfile)

    def run(self):
        '''
        Sample GPU metrics every ``self.rate`` seconds, forever.
        NOTE: override the method in subclass for a different payload.
        '''

        def gpu_mon(file):
            """Run one mthreads-gmi sample and append the result to *file*."""
            timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
            # The awk stage buffers the five raw fields per GPU
            # (temp, power, used, total, util) and reorders them to
            # "total util power temp used" per output line.
            # TODO more elegant way?
            cmd = "mthreads-gmi -q | grep -E 'GPU Current Temp|Power Draw|Used|Total|Gpu' | "
            cmd += "awk -F ': *' '/GPU Current Temp|Power Draw|Used|Total|Gpu/ { values[(NR-1)%5+1] = $2; } NR % 5 == 0 { print values[4], values[5], values[2], values[1], values[3]; }'"
            process = subprocess.Popen(cmd,
                                       shell=True,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.STDOUT,
                                       encoding='utf-8')
            try:
                out = process.communicate(timeout=10)
            except subprocess.TimeoutExpired:
                process.kill()
                out = process.communicate()

            # BUGFIX: the original assigned "error" and then unconditionally
            # overwrote it, so failures were logged as garbage output.
            if process.returncode != 0:
                result = timestamp + "\n" + "error" + "\n"
            else:
                result = timestamp + "\n" + out[0] + "\n"
            with open(file, 'a') as f:
                f.write(result)

        def timer_gpu_mon():
            # Sample in a child process so a hung mthreads-gmi cannot
            # stall the scheduler loop.
            gpu_process = Process(target=gpu_mon, args=(self.gpufile, ))
            gpu_process.start()

        schedule.every(self.rate).seconds.do(timer_gpu_mon)
        while True:
            schedule.run_pending()
            time.sleep(5)

    def daemonize(self):
        """Detach from the controlling terminal via the classic double
        fork, redirect stdio, and record our pid in the pid file."""
        if self.verbose >= 1:
            print('daemon process starting ...')
        # First fork: parent exits, child continues in the background.
        try:
            pid = os.fork()
            if pid > 0:
                sys.exit(0)
        except OSError as e:
            sys.stderr.write('fork #1 failed: %d (%s)\n' %
                             (e.errno, e.strerror))
            sys.exit(1)
        os.chdir(self.home_dir)
        os.setsid()  # become session leader, drop the controlling tty
        os.umask(self.umask)
        # Second fork: ensure we can never re-acquire a controlling tty.
        try:
            pid = os.fork()
            if pid > 0:
                sys.exit(0)
        except OSError as e:
            sys.stderr.write('fork #2 failed: %d (%s)\n' %
                             (e.errno, e.strerror))
            sys.exit(1)
        sys.stdout.flush()
        sys.stderr.flush()
        # Redirect the standard file descriptors.
        si = open(self.stdin, 'r')
        so = open(self.stdout, 'a+')
        if self.stderr:
            se = open(self.stderr, 'a+')
        else:
            se = so
        os.dup2(si.fileno(), sys.stdin.fileno())
        os.dup2(so.fileno(), sys.stdout.fileno())
        os.dup2(se.fileno(), sys.stderr.fileno())
        atexit.register(self.del_pid)
        pid = str(os.getpid())
        with open(self.pidfile, 'w+') as f:
            f.write('%s\n' % pid)

    def start(self):
        """Start the daemon unless the pid file says one already runs."""
        if not os.path.exists(self.logpath):
            os.makedirs(self.logpath)
        elif os.path.exists(self.gpufile):
            # Fresh run: discard metrics from a previous session.
            os.remove(self.gpufile)
        if self.verbose >= 1:
            print('ready to start ......')
        # check for a pid file to see if the daemon already runs
        pid = self.get_pid()
        if pid:
            msg = 'pid file %s already exists, is it already running?\n'
            sys.stderr.write(msg % self.pidfile)
            sys.exit(1)
        # start the daemon
        self.daemonize()
        self.run()

    def stop(self):
        """Stop the daemon: SIGTERM the recorded pid, escalating to
        SIGHUP every 10 attempts, then clean up the pid file."""
        if self.verbose >= 1:
            print('stopping ...')
        pid = self.get_pid()
        if not pid:
            msg = 'pid file [%s] does not exist. Not running?\n' % self.pidfile
            sys.stderr.write(msg)
            if os.path.exists(self.pidfile):
                os.remove(self.pidfile)  # stale or corrupt pid file
            return
        # try to kill the daemon process
        try:
            i = 0
            while 1:
                os.kill(pid, signal.SIGTERM)
                time.sleep(1)
                i = i + 1
                if i % 10 == 0:
                    os.kill(pid, signal.SIGHUP)
        except ProcessLookupError:
            # Process is gone (replaces the fragile substring match on
            # the OSError message) -- remove the now-stale pid file.
            if os.path.exists(self.pidfile):
                os.remove(self.pidfile)
        except OSError as err:
            print(str(err))
            sys.exit(1)
        if self.verbose >= 1:
            print('Stopped!')

    def restart(self):
        """Stop (if running) and start again."""
        self.stop()
        self.start()

    def status(self):
        """Return the live daemon's pid, or False when not running."""
        pid = self.get_pid()
        if pid:
            if os.path.exists('/proc/%d' % pid):
                return pid
        return False


def parse_args():
    '''Parse the command-line options of the monitor script and return
    the resulting namespace (``.o`` = operation, ``.l`` = log path).'''
    parser = argparse.ArgumentParser(description='Sys monitor script')
    parser.add_argument('-o',
                        type=str,
                        metavar='[operation]',
                        required=True,
                        help='start|stop|restart|status')
    parser.add_argument('-l',
                        type=str,
                        metavar='[log_path]',
                        required=False,
                        default='./logs/',
                        help='log path')
    return parser.parse_args()


def main():
    '''Entry point: build the monitor daemon from the command-line
    options and dispatch the requested operation.'''
    sample_interval = 5  # seconds between GPU samples
    args = parse_args()
    operation = args.o
    log_path = args.l
    pid_fn = '/tmp/gpu_monitor.pid'
    log_fn = log_path + '/mthreads_monitor.log'
    err_fn = log_path + '/mthreads_monitor.err'
    # result for gpu
    gpu_fn = log_path + '/mthreads_monitor.log'

    subdaemon = Daemon(pid_fn,
                       log_fn,
                       err_fn,
                       gpu_fn,
                       log_path,
                       verbose=1,
                       rate=sample_interval)
    if operation == 'status':
        pid = subdaemon.status()
        if pid:
            print('process [%s] is running ......' % pid)
        else:
            print('daemon process [%s] stopped' % pid)
    elif operation in ('start', 'stop', 'restart'):
        # Operation names map 1:1 onto Daemon methods.
        getattr(subdaemon, operation)()
    else:
        print("invalid argument!")
        sys.exit(1)


# Run as a CLI tool: parse arguments and start/stop/restart/query the daemon.
if __name__ == '__main__':
    main()
Loading

0 comments on commit b29f5a1

Please sign in to comment.