
Commit

Merge branch 'main' into moflow
yuzhou03 authored Mar 5, 2024
2 parents 285bb7f + 9b0280a commit e019f7a
Showing 41 changed files with 1,219 additions and 4 deletions.
2 changes: 2 additions & 0 deletions inference/benchmarks/stable_diffusion_v1_4/README.md
@@ -59,3 +59,5 @@
| kunlunxin_xtcl | fp32 | 2 | / | / | / | / | / | / | 26.524/25.3 | 0.07/32.0 |
| null | fp16 | 16 | / | 11.7 | 60.7 | / | / | 13.2% | -/25.2 | 5.7/40.0 |
| null | fp32 | 8 | / | 9.3 | 27.3 | / | / | 11.9% | -/25.3 | 6.3/40.0 |
| metax-nocompiler | fp16 | 64 | / | / | / | / | / | 12.7% | -/25.4 | 14.7/64.0 |
| metax-nocompiler | fp32 | 16 | / | / | / | / | / | 10.3% | -/25.4 | 55.57/64.0 |
5 changes: 4 additions & 1 deletion inference/configs/host.yaml
@@ -8,10 +8,13 @@ SSH_PORT: "22"
HOSTS_PORTS: ["2222"]
MASTER_PORT: "29501"
SHM_SIZE: "32G"
# metax:
# " --device=/dev/dri --device=/dev/mxcd --group-add video"
ACCE_CONTAINER_OPT: " --gpus all"
PIP_SOURCE: "https://mirror.baidu.com/pypi/simple"
CLEAR_CACHES: True
ACCE_VISIBLE_DEVICE_ENV_NAME: "CUDA_VISIBLE_DEVICES"
CASES:
# "resnet50:pytorch_1.13": "/raid/dataset/ImageNet/imagenet/val"
"vit_l_16:pytorch_2.1": "/raid/dataset/ImageNet_1k_2012/val"
"vit_l_16:pytorch_2.1": "/raid/dataset/ImageNet_1k_2012/val"
#"stable_diffusion_v1_4:pytorch_2.0": "/raid/dataset/stable_diffusion_v1_4/"
17 changes: 17 additions & 0 deletions inference/docker_images/metax/metax_analysis.py
@@ -0,0 +1,17 @@
def analysis_log(logpath):
    max_usage = 0.0
    max_mem = 0.0
    with open(logpath) as logfile:
        for line in logfile.readlines():
            # Memory lines carry a "used/total" MiB field in the third
            # space-separated column of the monitor log.
            if "MiB" in line:
                usage_and_maxusage = line.split(" ")[2]
                usage = float(usage_and_maxusage.split("/")[0])
                max_usage = max(max_usage, usage)
                max_mem = float(usage_and_maxusage.split("/")[1])
    print(max_mem)
    print(max_usage)
    # Convert MiB to GiB; the two trailing constants are the accelerator's
    # theoretical peak FLOPS values reported alongside the memory stats.
    return round(max_usage / 1024.0, 2), round(max_mem / 1024.0, 2), 120e12, 240e12
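
A minimal usage sketch for analysis_log. The synthetic log line below is an assumption inferred from how the parser splits fields (third space-separated token = "used/total" in MiB), not captured mx-smi output:

import tempfile

# Hypothetical monitor log: one timestamp line plus one memory line.
sample = "2024-03-05-12:00:00\n35C 120W 1024/65536 MiB\n"
with tempfile.NamedTemporaryFile("w", suffix=".log", delete=False) as f:
    f.write(sample)
    path = f.name

print(analysis_log(path))  # -> (1.0, 64.0, 120000000000000.0, 240000000000000.0)
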
256 changes: 256 additions & 0 deletions inference/docker_images/metax/metax_monitor.py
@@ -0,0 +1,256 @@
#!/usr/bin/env python3
# encoding: utf-8
'''
Usage: python3 metax_monitor.py -o operation -l [log_path]
    -o, --operation   start|stop|restart|status
    -l, --log         log path, ./logs/ by default
'''

import os
import sys
import time
import signal
import atexit
import argparse
import datetime
from multiprocessing import Process
import subprocess
import schedule


class Daemon:
    '''
    Daemon subprocess class.
    Usage: subclass this daemon and override the run() method.
    sys-monitor.pid: kept in /tmp/, removed automatically on unexpected exit.
    verbose: debug mode, disabled by default.
    '''

    def __init__(self,
                 pid_file,
                 log_file,
                 err_file,
                 gpu_log,
                 log_path,
                 rate=5,
                 stdin=os.devnull,
                 stdout=os.devnull,
                 stderr=os.devnull,
                 home_dir='.',
                 umask=0o22,
                 verbose=0):
        self.stdin = stdin
        self.stdout = stdout
        self.stderr = stderr
        self.home_dir = home_dir
        self.verbose = verbose
        self.pidfile = pid_file
        self.logfile = log_file
        self.errfile = err_file
        self.gpufile = gpu_log
        self.logpath = log_path
        self.rate = rate
        self.umask = umask
        self.daemon_alive = True

    def get_pid(self):
        try:
            with open(self.pidfile, 'r') as pf:
                pid = int(pf.read().strip())
        except IOError:
            pid = None
        except SystemExit:
            pid = None
        return pid

    def del_pid(self):
        if os.path.exists(self.pidfile):
            os.remove(self.pidfile)

    def run(self):
        '''
        NOTE: override this method in a subclass.
        '''

        def gpu_mon(file):
            TIMESTAMP = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
            # Sample power/temperature and memory usage from mx-smi.
            cmd = "mx-smi |grep 'W' -m 1 | awk '{print $2, $3, $5,$6}' && mx-smi |grep 'MXC' -m 1 | awk '{print $7}'"
            process = subprocess.Popen(cmd,
                                       shell=True,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.STDOUT,
                                       encoding='utf-8')
            try:
                out = process.communicate(timeout=10)
            except subprocess.TimeoutExpired:
                process.kill()
                out = process.communicate()

            if process.returncode != 0:
                result = TIMESTAMP + "\n" + "error" + "\n"
            else:
                result = TIMESTAMP + "\n" + out[0] + "\n"
            with open(file, 'a') as f:
                f.write(result)

        def timer_gpu_mon():
            gpu_process = Process(target=gpu_mon, args=(self.gpufile, ))
            gpu_process.start()

        schedule.every(self.rate).seconds.do(timer_gpu_mon)
        while True:
            schedule.run_pending()
            time.sleep(5)

    def daemonize(self):
        if self.verbose >= 1:
            print('daemon process starting ...')
        # First fork: detach from the parent process.
        try:
            pid = os.fork()
            if pid > 0:
                sys.exit(0)
        except OSError as e:
            sys.stderr.write('fork #1 failed: %d (%s)\n' %
                             (e.errno, e.strerror))
            sys.exit(1)
        os.chdir(self.home_dir)
        os.setsid()
        os.umask(self.umask)
        # Second fork: prevent the daemon from re-acquiring a controlling terminal.
        try:
            pid = os.fork()
            if pid > 0:
                sys.exit(0)
        except OSError as e:
            sys.stderr.write('fork #2 failed: %d (%s)\n' %
                             (e.errno, e.strerror))
            sys.exit(1)
        sys.stdout.flush()
        sys.stderr.flush()
        # Redirect the standard file descriptors.
        si = open(self.stdin, 'r')
        so = open(self.stdout, 'a+')
        if self.stderr:
            se = open(self.stderr, 'a+')
        else:
            se = so
        os.dup2(si.fileno(), sys.stdin.fileno())
        os.dup2(so.fileno(), sys.stdout.fileno())
        os.dup2(se.fileno(), sys.stderr.fileno())
        atexit.register(self.del_pid)
        pid = str(os.getpid())
        with open(self.pidfile, 'w+') as f:
            f.write('%s\n' % pid)

    def start(self):
        if not os.path.exists(self.logpath):
            os.makedirs(self.logpath)
        elif os.path.exists(self.gpufile):
            os.remove(self.gpufile)
        if self.verbose >= 1:
            print('ready to start ......')
        # Check for a pid file to see if the daemon is already running.
        pid = self.get_pid()
        if pid:
            msg = 'pid file %s already exists, is it already running?\n'
            sys.stderr.write(msg % self.pidfile)
            sys.exit(1)
        # Start the daemon.
        self.daemonize()
        self.run()

    def stop(self):
        if self.verbose >= 1:
            print('stopping ...')
        pid = self.get_pid()
        if not pid:
            msg = 'pid file [%s] does not exist. Not running?\n' % self.pidfile
            sys.stderr.write(msg)
            if os.path.exists(self.pidfile):
                os.remove(self.pidfile)
            return
        # Try to kill the daemon process.
        try:
            i = 0
            while True:
                os.kill(pid, signal.SIGTERM)
                time.sleep(1)
                i = i + 1
                if i % 10 == 0:
                    os.kill(pid, signal.SIGHUP)
        except OSError as err:
            err = str(err)
            if 'No such process' in err:
                if os.path.exists(self.pidfile):
                    os.remove(self.pidfile)
            else:
                print(str(err))
                sys.exit(1)
        if self.verbose >= 1:
            print('Stopped!')

    def restart(self):
        self.stop()
        self.start()

    def status(self):
        pid = self.get_pid()
        if pid:
            if os.path.exists('/proc/%d' % pid):
                return pid
        return False


def parse_args():
    ''' Check script input parameters. '''
    parse = argparse.ArgumentParser(description='Sys monitor script')
    parse.add_argument('-o',
                       type=str,
                       metavar='[operation]',
                       required=True,
                       help='start|stop|restart|status')
    parse.add_argument('-l',
                       type=str,
                       metavar='[log_path]',
                       required=False,
                       default='./logs/',
                       help='log path')
    args = parse.parse_args()
    return args


def main():
    sample_rate1 = 5
    args = parse_args()
    operation = args.o
    log_path = args.l
    pid_fn = str('/tmp/gpu_monitor.pid')
    log_fn = str(log_path + '/metax_monitor.log')
    err_fn = str(log_path + '/metax_monitor.err')
    # result for gpu
    gpu_fn = str(log_path + '/metax_monitor.log')

    subdaemon = Daemon(pid_fn,
                       log_fn,
                       err_fn,
                       gpu_fn,
                       log_path,
                       verbose=1,
                       rate=sample_rate1)
    if operation == 'start':
        subdaemon.start()
    elif operation == 'stop':
        subdaemon.stop()
    elif operation == 'restart':
        subdaemon.restart()
    elif operation == 'status':
        pid = subdaemon.status()
        if pid:
            print('process [%s] is running ......' % pid)
        else:
            print('daemon process [%s] stopped' % pid)
    else:
        print("invalid argument!")
        sys.exit(1)


if __name__ == '__main__':
    main()
11 changes: 11 additions & 0 deletions inference/docker_images/metax/pytorch_2.0/Dockerfile
@@ -0,0 +1,11 @@
FROM mxcr.io/library/maca-c500-pytorch:2.19.2.5-ubuntu18.04-amd64
ENV PATH="/opt/conda/bin:${PATH}"
ENV PYTORCH_USE_FLASHATTN=1
RUN pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
RUN pip install numpy
RUN pip install pyyaml
RUN pip install timm
RUN pip install munch
RUN pip install schedule
RUN pip install loguru
RUN /bin/bash -c "uname -a"
4 changes: 3 additions & 1 deletion training/benchmarks/llama2_70B/megatron/megatron_main.sh
@@ -131,8 +131,10 @@ LOGGING_ARGS="
--log-interval 1
"

CODE_PATH="/workspace/FlagScale/pretrain_llama.py"

source $VENDOR_SHELL
cmd="torchrun $DISTRIBUTED_ARGS /workspace/FlagScale/pretrain_llama.py \
cmd="torchrun $DISTRIBUTED_ARGS $CODE_PATH \
$TRAINING_ARGS \
$MIXED_PRECISION_ARGS \
$DATA_ARGS \
6 changes: 6 additions & 0 deletions training/kunlunxin/docker_image/megatron/Dockerfile
@@ -0,0 +1,6 @@
FROM iregistry.baidu-int.com/xmlir/xmlir_ubuntu_2004_x86_64:v0.27
RUN /bin/bash -c "pip config set global.index-url https://mirror.baidu.com/pypi/simple"
RUN /bin/bash -c "uname -a"
RUN /bin/bash -c "alias python3=python"

ENV PATH /root/miniconda/envs/python38_torch201_cuda/bin:$PATH
14 changes: 14 additions & 0 deletions training/kunlunxin/docker_image/megatron/megatron_install.sh
@@ -0,0 +1,14 @@
#!/bin/bash
# using github mirrors to avoid github TTL
#export https_proxy=http://10.1.0.34:7890
git clone https://githubfast.com/FlagOpen/FlagScale
cd FlagScale

git checkout eb0438a5459404e2e4c70b15fa37e9a197ab159d
echo 'export PYTHONPATH=$PYTHONPATH:/home/FlagScale' >> /root/.bashrc
source /root/.bashrc

wget https://bd.bcebos.com/v1/klx-pytorch-work-bd/training/zhangling21_llama70B/xmlir201_5.run
bash xmlir201_5.run
XFLAGS --enable transformer_engine
XFLAGS --enable flagscale
49 changes: 49 additions & 0 deletions training/kunlunxin/llama2_70B-megatron/README.md
@@ -0,0 +1,49 @@
### Kunlunxin XPU Configuration and Run Information
#### Environment Setup
- ##### Hardware
  - Machine model: Kunlunxin AI accelerator group R480-X8
  - Accelerator card model: Kunlunxin AI accelerator card R300
  - Multi-node network type and bandwidth: InfiniBand, 200 Gb/s

- ##### Software
  - OS version: Ubuntu 20.04
  - OS kernel version: 5.4.0-26-generic
  - Accelerator driver version: 4.0.25
  - Docker image and version: iregistry.baidu-int.com/xmlir/xmlir_ubuntu_2004_x86_64:v0.27
  - Training framework version: xmlir
  - Training compiler version: xacc
  - Dependency version: pytorch-2.0.1


### Run Results

* Input batch sizes
  1. local_batchsize (micro_batchsize), abbreviated LBS: the tensor batch size actually fed into the model, as set in config_H100x4x8.py; 1 by default in this case
  2. seqlength (max_position_embedding), abbreviated MPE: the sequence length actually fed into the model, as set in config_H100x4x8.py; 4096 by default in this case
  3. gradient_accumulate_steps, abbreviated GAS: the number of gradient-accumulation steps, as set in ds_config.json; 44 by default in this case
  4. global_batchsize is always local_batchsize \* gradient_accumulate_steps \* data_parallel_size, where data_parallel_size = world_size / TPsize / PPsize in this case; see the worked example below.
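
As a sanity check, here is a small Python sketch of the global-batch-size arithmetic for the 10x8 R300 configuration listed in this README:

```python
# Worked example of the global_batchsize formula above, using the
# TP8PP10 configuration on 10 machines x 8 R300 cards from this README.
world_size = 10 * 8           # 80 cards in total
tp, pp = 8, 10                # tensor- and pipeline-parallel sizes
lbs, gas = 1, 44              # local batch size and gradient-accumulation steps

dp = world_size // (tp * pp)  # data_parallel_size = 80 / 8 / 10 = 1
gbs = lbs * gas * dp          # global_batchsize = 1 * 44 * 1 = 44
print(dp, gbs)                # -> 1 44
```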

* General metrics

| Metric | Value | Notes |
| ------------ | -------------------------- | ---------------------------------- |
| Task category | natural language understanding | |
| Model | llama2_70b | |
| Dataset | pile wikipedia | |
| Data precision | precision, see "Performance metrics" | one of fp32/amp/fp16/bf16 |
| Hyperparameter change | parallel, see "Performance metrics" | format TPxPPyDPz, e.g. TP2PP1DP4 |
| Hyperparameter change | fix_hp, see "Performance metrics" | extra hyperparameters needed to saturate the hardware when measuring throughput |
| Hardware name | nvidia H800 | |
| Hardware memory usage | mem, see "Performance metrics" | commonly called "device memory", in GiB |
| Compute utilization | MFU, see "Performance metrics" | as defined in the PaLM paper; see the sketch below |
| **Throughput** | **token/p/s, see "Performance metrics"** | average tokens processed per card per second |
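
A short sketch of how MFU can be computed under the PaLM-paper definition referenced above, using the common "6 x parameter count" FLOPs-per-token approximation; the throughput and peak-FLOPS numbers below are placeholders, not measured R300 values:

```python
# Hypothetical MFU computation (PaLM-style): model FLOPs per token are
# approximated as 6 * parameter_count (forward + backward pass).
params = 70e9                 # llama2_70b parameter count
tokens_per_gpu_per_s = 100.0  # placeholder for the measured token/p/s metric
peak_flops = 240e12           # placeholder peak FLOPS of one accelerator

mfu = tokens_per_gpu_per_s * 6 * params / peak_flops
print(f"MFU = {mfu:.1%}")     # -> MFU = 17.5%
```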

* Performance metrics

Note that the fourth experiment group below uses the same global_batchsize as the original llama2 paper and trains for 100 steps; it also serves as the accuracy-alignment experiment.

| Config | precision | parallel | fix_hp | token/p/s | accuracy aligned | mem | MFU |
| ------------------- | --------- | --------- | ---------------------------- | --------- | ----- | ----- | --- |
| R300 10 machines x 8 cards (10x8) | fp32 | TP8PP10DP1 | / | / | / | 21/32 | / |
| R300 10 machines x 8 cards (10x8) | amp | TP8PP10DP1 | GAS=1024 (GBS=1024 = 4M tokens) | / | doing* | 21/32 | / |

Because R300 machines are in short supply, accuracy was first verified on a single R300 card against a single GPU. By reducing the number of model layers, accuracy has been verified on a single R300 versus a single GPU; accuracy verification of the full 70B model is in progress.
