From 71813f73701e89135b845b2d6f1e18377fa8c232 Mon Sep 17 00:00:00 2001 From: FaJingyi Date: Tue, 5 Mar 2024 11:15:43 +0800 Subject: [PATCH 1/4] [metax] swintransformer-inference pr (#473) * add metax swin-transformer * mod readme * mod readme * mod swin * Update README.md * Update config_common.py * Update requirements.txt * fix torch_six in swin_transformer * Update utils.py * add metax swintrans-infer --------- Co-authored-by: jingyifa --- inference/benchmarks/swinTransformer/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/inference/benchmarks/swinTransformer/README.md b/inference/benchmarks/swinTransformer/README.md index 14304fed9..ba517c573 100644 --- a/inference/benchmarks/swinTransformer/README.md +++ b/inference/benchmarks/swinTransformer/README.md @@ -84,4 +84,5 @@ find ./val -name "*JPEG" | wc -l | ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ----------- | | tensorrt | fp16 | 512 |1011.7 | 1347.5 | 1511.3 | 1231.7 | 1359.1 | 6.8% | 81.7/83.2 | 19.9/40.0 | | tensorrt | fp32 | 256 | 856.9 | 761.5 | 794.3 | 789.2 | 826.4 | 8.2% | 83.2/83.2 | 20.0/40.0 | -| kunlunxin_xtcl| W32A16 | 256 | 543.745 | / | / | / | / | / | 0.832 | / | +| kunlunxin_xtcl| W32A16 | 256 | / | / | / | / | / | / | 0.832 | / | +| metax-nocompiler| fp16 | 512 | / | / | / | / | / | 6.5% | 0.832 |10.6/64.0 | From 61cd337663c1578455f2b72b63ce37640dfa45e7 Mon Sep 17 00:00:00 2001 From: Rayyyyy <109121546+RRRRRayyyyy@users.noreply.github.com> Date: Tue, 5 Mar 2024 16:54:39 +0800 Subject: [PATCH 2/4] [DCU]Add glm case of dcu in FlagPerf. (#472) * Add glm case of dcu in Flagperf. * update 1*1 log * Update README infos in glm_pytorch of DCU. --------- Co-authored-by: shh2000 <13820618441@163.com> --- training/benchmarks/driver/helper.py | 6 + training/dcu/README.md | 66 ++++ training/dcu/dcu_monitor.py | 289 +++++++++++++++++ .../dcu/docker_image/pytorch_1.13/Dockerfile | 5 + .../pytorch_1.13/pytorch1.13_install.sh | 10 + training/dcu/glm-pytorch/README.md | 48 +++ .../dcu/glm-pytorch/config/config_K100x1x1.py | 19 ++ .../dcu/glm-pytorch/config/config_K100x1x8.py | 18 ++ .../config/environment_variables.sh | 6 + .../dcu/glm-pytorch/config/requirements.txt | 3 + training/dcu/glm-pytorch/extern/converter.py | 21 ++ .../dcu/glm-pytorch/extern/layers/__init__.py | 1 + .../glm-pytorch/extern/layers/layernorm.py | 1 + .../glm-pytorch/extern/layers/transformer.py | 298 ++++++++++++++++++ .../extern/layers/transformer_block.py | 125 ++++++++ .../dcu/glm-pytorch/extern/trainer_adapter.py | 80 +++++ training/run_benchmarks/config/test_conf.py | 13 +- 17 files changed, 1006 insertions(+), 3 deletions(-) create mode 100644 training/dcu/README.md create mode 100644 training/dcu/dcu_monitor.py create mode 100644 training/dcu/docker_image/pytorch_1.13/Dockerfile create mode 100644 training/dcu/docker_image/pytorch_1.13/pytorch1.13_install.sh create mode 100644 training/dcu/glm-pytorch/README.md create mode 100644 training/dcu/glm-pytorch/config/config_K100x1x1.py create mode 100644 training/dcu/glm-pytorch/config/config_K100x1x8.py create mode 100644 training/dcu/glm-pytorch/config/environment_variables.sh create mode 100644 training/dcu/glm-pytorch/config/requirements.txt create mode 100644 training/dcu/glm-pytorch/extern/converter.py create mode 100644 training/dcu/glm-pytorch/extern/layers/__init__.py create mode 100644 training/dcu/glm-pytorch/extern/layers/layernorm.py create mode 100644 
training/dcu/glm-pytorch/extern/layers/transformer.py
 create mode 100644 training/dcu/glm-pytorch/extern/layers/transformer_block.py
 create mode 100644 training/dcu/glm-pytorch/extern/trainer_adapter.py

diff --git a/training/benchmarks/driver/helper.py b/training/benchmarks/driver/helper.py
index 545e83f19..338b3d8ac 100644
--- a/training/benchmarks/driver/helper.py
+++ b/training/benchmarks/driver/helper.py
@@ -88,6 +88,12 @@ def set_seed(self, seed: int, vendor: str = None):
             torch.backends.cudnn.benchmark = getattr(config, "cudnn_benchmark")
             torch.backends.cudnn.deterministic = getattr(
                 config, "cudnn_deterministic")
+        elif lower_vendor == "dcu":
+            import torch
+            torch.manual_seed(seed)
+            torch.cuda.manual_seed(seed)
+            torch.cuda.manual_seed_all(seed)
+            torch.backends.cudnn.benchmark = True
         else:
             # TODO: extend here to set the seed for other vendors
             pass

diff --git a/training/dcu/README.md b/training/dcu/README.md
new file mode 100644
index 000000000..93478c9a7
--- /dev/null
+++ b/training/dcu/README.md
@@ -0,0 +1,66 @@
+# Vendor Information
+
+The Hygon DCU product line is built on a GPGPU architecture and is compatible with the common "CUDA-like" environment as well as mainstream international commercial computing and AI software. Its rich hardware/software ecosystem makes it broadly applicable to big-data processing, artificial intelligence, commercial computing, and related fields.
+
+Thanks to this "CUDA-like" compatibility and ecosystem, Hygon DCUs reach the performance level of comparable high-end international products in typical application scenarios.
+
+Hygon DCUs mainly target compute-intensive workloads such as big-data processing and commercial computing, together with AI and AI-adjacent acceleration.
+
+# FlagPerf Adaptation and Verification Environment
+## Reference environment configuration
+ - Hardware
+   - Machine model: K100 standard server
+   - Accelerator model: K100 64G
+ - Software
+   - OS version: CentOS 7.6
+   - OS kernel version: 4.18.0-348.el8.0.2.x86_64
+   - Docker version: 24.0.7
+
+## Container image information
+- Image build information
+  - Dockerfile path: training/dcu/docker_image/<framework>/Dockerfile
+  - Post-build software install script: training/dcu/docker_image/<framework>/<framework>_install.sh
+
+- Core software
+
+  - AI framework & version
+    - torch: 1.13.1
+
+  - Other software versions
+    - dtk: 23.10.1
+
+
+## Accelerator monitoring
+- Accelerator usage sampling command
+
+  Line 79 of dcu_monitor.py must be changed to the path that is actually sourced:
+
+  ```
+  source /path/of/dtk/env.sh
+  rocm-smi
+  ```
+
+- Sample monitor output:
+
+  ```
+  ============================ System Management Interface =============================
+  ======================================================================================
+  DCU     Temp     AvgPwr     Perf     PwrCap     VRAM%      DCU%      Mode
+  0       53.0C    96.0W      auto     300.0W     0%         0%        Normal
+  1       53.0C    96.0W      auto     300.0W     0%         0%        Normal
+  2       54.0C    95.0W      auto     300.0W     0%         0%        Normal
+  3       55.0C    96.0W      auto     300.0W     0%         0%        Normal
+  4       54.0C    97.0W      auto     300.0W     0%         0%        Normal
+  5       54.0C    95.0W      auto     300.0W     0%         0%        Normal
+  6       55.0C    93.0W      auto     300.0W     0%         0%        Normal
+  7       54.0C    96.0W      auto     300.0W     0%         0%        Normal
+  ======================================================================================
+  =================================== End of SMI Log ===================================
+  ```
+
+- Collected metrics
+
+|Metric| Log file |
+|---|---|
+|VRAM(%) | dcu_monitor.log |
+|DCU(%) | dcu_monitor.log |
\ No newline at end of file
diff --git a/training/dcu/dcu_monitor.py b/training/dcu/dcu_monitor.py
new file mode 100644
index 000000000..4572a082f
--- /dev/null
+++ b/training/dcu/dcu_monitor.py
@@ -0,0 +1,289 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+'''
+Usage: python3 dcu_monitor.py -o operation -l [log_path]
+            -o, --operation start|stop|restart|status
+            -l, --log       log path, ./logs/ by default
+'''
+
+import os
+import sys
+import time
+import signal
+import atexit
+import argparse
+import datetime
+from multiprocessing import Process
+import subprocess
+import schedule
+
+
+class Daemon:
+    '''
+    Daemon subprocess class.
+    usage: subclass this Daemon and override the run() method.
+    dcu_monitor.pid: kept in /tmp/, removed automatically on unexpected exit.
+    verbose: debug mode, disabled by default.
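+    rate: sampling interval in seconds, 5 by default.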
+    '''
+
+    def __init__(self,
+                 pid_file,
+                 log_file,
+                 err_file,
+                 dcu_log,
+                 log_path,
+                 rate=5,
+                 stdin=os.devnull,
+                 stdout=os.devnull,
+                 stderr=os.devnull,
+                 home_dir='.',
+                 umask=0o22,
+                 verbose=0):
+        self.stdin = stdin
+        self.stdout = stdout
+        self.stderr = stderr
+        self.home_dir = home_dir
+        self.verbose = verbose
+        self.pidfile = pid_file
+        self.logfile = log_file
+        self.errfile = err_file
+        self.dcufile = dcu_log
+        self.logpath = log_path
+        self.rate = rate
+        self.umask = umask
+        self.verbose = verbose
+        self.daemon_alive = True
+
+    def get_pid(self):
+        try:
+            with open(self.pidfile, 'r') as pf:
+                pid = int(pf.read().strip())
+        except IOError:
+            pid = None
+        except SystemExit:
+            pid = None
+        return pid
+
+    def del_pid(self):
+        if os.path.exists(self.pidfile):
+            os.remove(self.pidfile)
+
+    def run(self):
+        '''
+        NOTE: override this method in a subclass
+        '''
+
+        def dcu_mon(file):
+            TIMESTAMP = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
+            # change this to the local DTK install path
+            cmd = "source /path/of/dtk/env.sh; rocm-smi"
+            process = subprocess.Popen(cmd,
+                                       shell=True,
+                                       stdout=subprocess.PIPE,
+                                       stderr=subprocess.STDOUT,
+                                       encoding='utf-8')
+            try:
+                out = process.communicate(timeout=10)
+            except subprocess.TimeoutExpired:
+                process.kill()
+                out = process.communicate()
+
+            # record "error" when rocm-smi fails instead of discarding the status
+            status = "error" if process.returncode != 0 else out[0]
+            result = TIMESTAMP + "\n" + status + "\n"
+            with open(file, 'a') as f:
+                f.write(result)
+
+        def timer_dcu_mon():
+            dcu_process = Process(target=dcu_mon, args=(self.dcufile, ))
+            dcu_process.start()
+
+        schedule.every(self.rate).seconds.do(timer_dcu_mon)
+        while True:
+            schedule.run_pending()
+            time.sleep(5)
+
+    def daemonize(self):
+        if self.verbose >= 1:
+            print('daemon process starting ...')
+        try:
+            pid = os.fork()
+            if pid > 0:
+                sys.exit(0)
+        except OSError as e:
+            sys.stderr.write('fork #1 failed: %d (%s)\n' %
+                             (e.errno, e.strerror))
+            sys.exit(1)
+        os.chdir(self.home_dir)
+        os.setsid()
+        os.umask(self.umask)
+        try:
+            pid = os.fork()
+            if pid > 0:
+                sys.exit(0)
+        except OSError as e:
+            sys.stderr.write('fork #2 failed: %d (%s)\n' %
+                             (e.errno, e.strerror))
+            sys.exit(1)
+        sys.stdout.flush()
+        sys.stderr.flush()
+        si = open(self.stdin, 'r')
+        so = open(self.stdout, 'a+')
+        if self.stderr:
+            se = open(self.stderr, 'a+')
+        else:
+            se = so
+        os.dup2(si.fileno(), sys.stdin.fileno())
+        os.dup2(so.fileno(), sys.stdout.fileno())
+        os.dup2(se.fileno(), sys.stderr.fileno())
+        atexit.register(self.del_pid)
+        pid = str(os.getpid())
+        with open(self.pidfile, 'w+') as f:
+            f.write('%s\n' % pid)
+
+    def start(self):
+        if not os.path.exists(self.logpath):
+            os.makedirs(self.logpath)
+        elif os.path.exists(self.dcufile):
+            os.remove(self.dcufile)
+        if self.verbose >= 1:
+            print('ready to start ......')
+        # check the pid file to see whether the daemon is already running
+        pid = self.get_pid()
+        if pid:
+            msg = 'pid file %s already exists, is it already running?\n'
+            sys.stderr.write(msg % self.pidfile)
+            sys.exit(1)
+        # start the daemon
+        self.daemonize()
+        self.run()
+
+    def stop(self):
+        if self.verbose >= 1:
+            print('stopping ...')
+        pid = self.get_pid()
+        if not pid:
+            msg = 'pid file [%s] does not exist. Not running?\n' % self.pidfile
+            sys.stderr.write(msg)
+            if os.path.exists(self.pidfile):
+                os.remove(self.pidfile)
+            return
+        # try to kill the daemon process
+        try:
+            i = 0
+            while 1:
+                os.kill(pid, signal.SIGTERM)
+                time.sleep(1)
+                i = i + 1
+                if i % 10 == 0:
+                    os.kill(pid, signal.SIGHUP)
+        except OSError as err:
+            err = str(err)
+            if err.find('No such process') > 0:
+                if os.path.exists(self.pidfile):
+                    os.remove(self.pidfile)
+            else:
+                print(str(err))
+                sys.exit(1)
+        if self.verbose >= 1:
+            print('Stopped!')
+
+    def restart(self):
+        self.stop()
+        self.start()
+
+    def status(self):
+        pid = self.get_pid()
+        if pid:
+            if os.path.exists('/proc/%d' % pid):
+                return pid
+        return False
+
+
+def parse_args():
+    ''' Parse the script's input parameters. '''
+    parse = argparse.ArgumentParser(description='Sys monitor script')
+    parse.add_argument('-o',
+                       type=str,
+                       metavar='[operation]',
+                       required=True,
+                       help='start|stop|restart|status')
+    parse.add_argument('-l',
+                       type=str,
+                       metavar='[log_path]',
+                       required=False,
+                       default='./logs/',
+                       help='log path')
+    args = parse.parse_args()
+    return args
+
+
+def get_system_info():
+    cmd = r"echo OS version:;"
+    cmd = cmd + r"cat /etc/issue | head -n1 | awk '{print $1, $2, $3}';"
+    cmd = cmd + r"echo ;"
+
+    cmd = cmd + r"echo OS Kernel version:;"
+    cmd = cmd + r"uname -r;"
+    cmd = cmd + r"echo ;"
+
+    cmd = cmd + r"echo Hardware Model:;"
+    cmd = cmd + r"sudo dmidecode | grep -A9 'System Information' | tail -n +2 | sed 's/^[ \t]*//';"
+    cmd = cmd + r"echo ;"
+
+    cmd = cmd + r"echo Accelerator Model:;"
+    cmd = cmd + r"rocm-smi -L;"
+    cmd = cmd + r"echo ;"
+
+    cmd = cmd + r"echo Accelerator Driver version:;"
+    cmd = cmd + r"rocm-smi | grep 'Driver Version' | awk '{print $3}';"
+    cmd = cmd + r"echo ;"
+
+    cmd = cmd + r"echo Docker version:;"
+    cmd = cmd + r"docker -v"
+
+    return cmd
+
+
+def main():
+    sample_rate1 = 5
+    args = parse_args()
+    operation = args.o
+    log_path = args.l
+    pid_fn = str('/tmp/dcu_monitor.pid')
+    log_fn = str(log_path + '/dcu_monitor.log')
+    err_fn = str(log_path + '/dcu_monitor.err')
+    # dcu sampling results are appended to the same dcu_monitor.log
+    dcu_fn = str(log_path + '/dcu_monitor.log')
+    sys_fn = str(log_path + '/sys_info.log')
+    cmd = get_system_info()
+    with open(sys_fn, "w") as f:
+        p = subprocess.Popen(cmd, shell=True, stdout=f, stderr=subprocess.STDOUT)
+        p.wait()
+
+    subdaemon = Daemon(pid_fn,
+                       log_fn,
+                       err_fn,
+                       dcu_fn,
+                       log_path,
+                       verbose=1,
+                       rate=sample_rate1)
+    if operation == 'start':
+        subdaemon.start()
+    elif operation == 'stop':
+        subdaemon.stop()
+    elif operation == 'restart':
+        subdaemon.restart()
+    elif operation == 'status':
+        pid = subdaemon.status()
+        if pid:
+            print('process [%s] is running ......' % pid)
+        else:
+            print('daemon process [%s] stopped' % pid)
+    else:
+        print("invalid argument!")
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()

diff --git a/training/dcu/docker_image/pytorch_1.13/Dockerfile b/training/dcu/docker_image/pytorch_1.13/Dockerfile
new file mode 100644
index 000000000..47e386e7a
--- /dev/null
+++ b/training/dcu/docker_image/pytorch_1.13/Dockerfile
@@ -0,0 +1,5 @@
+FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk23.10-py38
+RUN source /opt/dtk/env.sh
+RUN /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple"
+RUN /bin/bash -c "uname -a"
+RUN /bin/bash -c alias python3=python

diff --git a/training/dcu/docker_image/pytorch_1.13/pytorch1.13_install.sh b/training/dcu/docker_image/pytorch_1.13/pytorch1.13_install.sh
new file mode 100644
index 000000000..237096b5c
--- /dev/null
+++ b/training/dcu/docker_image/pytorch_1.13/pytorch1.13_install.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+proxy_server_ip=10.0.35.251
+PROXY_URL="http://$proxy_server_ip:3128/"
+NO_PROXY_ADDR="127.0.0.1,localhost,.local,.cluster.local,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16"
+export http_proxy="$PROXY_URL"
+export https_proxy="$PROXY_URL"
+export no_proxy="$NO_PROXY_ADDR"
+export HTTP_PROXY="$PROXY_URL"
+export HTTPS_PROXY="$PROXY_URL"
+export NO_PROXY="$NO_PROXY_ADDR"
\ No newline at end of file
diff --git a/training/dcu/glm-pytorch/README.md b/training/dcu/glm-pytorch/README.md
new file mode 100644
index 000000000..70246a4b5
--- /dev/null
+++ b/training/dcu/glm-pytorch/README.md
@@ -0,0 +1,48 @@
+### Model checkpoint download
+[Model checkpoint download](../../benchmarks/glm/README.md#模型checkpoint)
+### Test dataset download
+[Test dataset download](../../benchmarks/glm/README.md#数据集)
+
+### DCU configuration and run information reference
+#### Environment configuration
+- ##### Hardware environment
+  - Machine model: K100 standard server
+  - Accelerator model: K100 64G
+  - CPU model:
+  - Multi-node network type and bandwidth:
+- ##### Software environment
+  - OS version: CentOS 7.6
+  - OS kernel version: 4.18.0-348.el8.0.2.x86_64
+  - Accelerator driver version: dtk-23.10.1
+  - Docker version: 24.0.7
+  - Training framework version: pytorch-1.13.1+git7d2dd01.abi0.dtk2310
+  - Dependency software versions: none
+
+
+### Results
+* Common metrics
+
+| Metric | Value | Notes |
+| ---------------- | -------------------------------------------- | ------------------------------------------- |
+| Task category | natural language understanding, unconditional and conditional text generation | |
+| Model | GLM | |
+| Dataset | SuperGLUE | |
+| Precision | precision, see "Performance metrics" | fp32/amp/fp16 selectable |
+| Hyperparameter changes | fix_hp, see "Performance metrics" | special hyperparameters needed to saturate the hardware during throughput evaluation |
+| Hardware short name | DCU K100 | |
+| Device memory usage | mem (actual/total), see "Performance metrics" | commonly called "VRAM", in GiB |
+| End-to-end time | e2e_time, see "Performance metrics" | total time plus Perf initialization time, etc. |
+| Overall throughput | p_whole, see "Performance metrics" | actual training samples divided by total time (performance_whole) |
+| Training throughput | p_train, see "Performance metrics" | excludes the evaluation time at the end of each epoch |
+| **Compute throughput** | **p_core, see "Performance metrics"** | excludes data-I/O time (p3>p2>p1) |
+| **Accelerator utilization** | **\*MFU** | model FLOPs utilization |
+| Training result | acc, see "Performance metrics" | accuracy |
+| Additional modifications | none | |
+
+* Performance metrics
+
+| Config | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem | MFU |
+| ------------------ | --------- | ------------- | -------- | ------- | ------- | ------ | ----- | --------- | ----- |
+| K100 single node, 1 card (1x1) | fp16 | bs=16, lr=1e-05 | / | / | / | / | / | 55.0/64.0 | / |
+| K100 single node, 8 cards (1x8) | fp16 | bs=8, lr=1e-05 | / | / | / | / | 0.804 | 29.4/64.0 | / |
+
+The 2x8 results will be added in 2024.07; the machines are currently fully booked.
\ No newline at end of file
diff --git a/training/dcu/glm-pytorch/config/config_K100x1x1.py b/training/dcu/glm-pytorch/config/config_K100x1x1.py
new file mode 100644
index 000000000..4446c704e
--- /dev/null
+++ b/training/dcu/glm-pytorch/config/config_K100x1x1.py
@@ -0,0 +1,19 @@
+train_batch_size = 16
+eval_batch_size = 16
+
+max_samples_termination = 24135
+
+dist_backend = "nccl"
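+# note: PyTorch keeps the backend name "nccl" on DCU; under the ROCm-derived DTK stack it is typically served by RCCL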
+
+lr = 1e-5
+weight_decay = 0.1
+adam_beta1 = 0.9
+adam_beta2 = 0.999
+adam_eps = 1e-08
+gradient_accumulation_steps = 1
+warmup = 0.1
+lr_decay_ratio = 0.1
+lr_decay_iters = 4338
+log_freq = 1
+
+training_event = None

diff --git a/training/dcu/glm-pytorch/config/config_K100x1x8.py b/training/dcu/glm-pytorch/config/config_K100x1x8.py
new file mode 100644
index 000000000..70311cdc8
--- /dev/null
+++ b/training/dcu/glm-pytorch/config/config_K100x1x8.py
@@ -0,0 +1,18 @@
+train_batch_size = 8
+eval_batch_size = 8
+
+dist_backend = "nccl"
+
+lr = 1e-5
+weight_decay = 0.1
+adam_beta1 = 0.9
+adam_beta2 = 0.999
+adam_eps = 1e-08
+gradient_accumulation_steps = 1
+warmup = 0.1
+lr_decay_ratio = 0.1
+lr_decay_iters = 4338
+log_freq = 1
+seed = 10483
+max_samples_termination = 5553080
+training_event = None

diff --git a/training/dcu/glm-pytorch/config/environment_variables.sh b/training/dcu/glm-pytorch/config/environment_variables.sh
new file mode 100644
index 000000000..68fad820a
--- /dev/null
+++ b/training/dcu/glm-pytorch/config/environment_variables.sh
@@ -0,0 +1,6 @@
+# =================================================
+# Export variables
+# =================================================
+
+export HSA_FORCE_FINE_GRAIN_PCIE=1
+export GPU_FLUSH_ON_EXECUTION=1

diff --git a/training/dcu/glm-pytorch/config/requirements.txt b/training/dcu/glm-pytorch/config/requirements.txt
new file mode 100644
index 000000000..8599cad5e
--- /dev/null
+++ b/training/dcu/glm-pytorch/config/requirements.txt
@@ -0,0 +1,3 @@
+h5sparse
+boto3
+h5py
\ No newline at end of file
diff --git a/training/dcu/glm-pytorch/extern/converter.py b/training/dcu/glm-pytorch/extern/converter.py
new file mode 100644
index 000000000..330bee79a
--- /dev/null
+++ b/training/dcu/glm-pytorch/extern/converter.py
@@ -0,0 +1,21 @@
+from driver import dist_pytorch
+from .layers.transformer import GLMTransformer
+
+
+def convert_model(model, config):
+    if dist_pytorch.get_rank() == 0:
+        print("use apex layer norm", flush=True)
+    state_dict = model.state_dict()
+    transformer_layer = GLMTransformer(
+        num_layers=config.num_layers,
+        hidden_size=config.hidden_size,
+        num_attention_heads=config.num_attention_heads,
+        max_sequence_length=config.max_seq_length,
+        max_memory_length=config.max_memory_length,
+        embedding_dropout_prob=config.hidden_dropout,
+        attention_dropout_prob=config.attention_dropout,
+        output_dropout_prob=config.hidden_dropout,
+        checkpoint_activations=config.checkpoint_activations)
+    model.model.transformer = transformer_layer
+    model.load_state_dict(state_dict, strict=True)
+    return model

diff --git a/training/dcu/glm-pytorch/extern/layers/__init__.py b/training/dcu/glm-pytorch/extern/layers/__init__.py
new file mode 100644
index 000000000..dab9da7e4
--- /dev/null
+++ b/training/dcu/glm-pytorch/extern/layers/__init__.py
@@ -0,0 +1 @@
+from .transformer import *

diff --git a/training/dcu/glm-pytorch/extern/layers/layernorm.py b/training/dcu/glm-pytorch/extern/layers/layernorm.py
new file mode 100644
index 000000000..96a935cc6
--- /dev/null
+++ b/training/dcu/glm-pytorch/extern/layers/layernorm.py
@@ -0,0 +1 @@
+from apex.normalization import FusedLayerNorm as LayerNorm

diff --git a/training/dcu/glm-pytorch/extern/layers/transformer.py b/training/dcu/glm-pytorch/extern/layers/transformer.py
new file mode 100644
index 000000000..82e98ed26
--- /dev/null
+++ b/training/dcu/glm-pytorch/extern/layers/transformer.py
@@ -0,0 +1,298 @@
+import torch
+import math
+
+from .transformer_block import GLMTransformerLayer
+from .layernorm import LayerNorm
+from model.models.checkpoint import checkpoint
+
+
+def scaled_init_method(sigma, num_layers):
+    """Init method based on N(0, sigma/sqrt(2*num_layers))."""
+    std = sigma / math.sqrt(2.0 * num_layers)
+
+    def init_(tensor):
+        return torch.nn.init.normal_(tensor, mean=0.0, std=std)
+
+    return init_
+
+
+def unscaled_init_method(sigma):
+    """Init method based on N(0, sigma)."""
+
+    def init_(tensor):
+        return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)
+
+    return init_
+
+
+class GLMTransformer(torch.nn.Module):
+    """GPT-2 transformer.
+
+    This module takes input from the embedding layer, and its output can
+    be used directly by a logit layer. It consists of L (num-layers)
+    blocks of:
+        layer norm
+        self attention
+        residual connection
+        layer norm
+        mlp
+        residual connection
+    followed by a final layer norm.
+
+    Arguments:
+        num_layers: Number of transformer layers.
+        hidden_size: The hidden size of the self attention.
+        num_attention_heads: number of attention heads in the self
+                             attention.
+        attention_dropout_prob: dropout probability of the attention
+                                score in self attention.
+        output_dropout_prob: dropout probability for the outputs
+                             after self attention and final output.
+        checkpoint_activations: if True, checkpoint activations.
+        checkpoint_num_layers: number of layers to checkpoint. This
+                               is basically the chunk size in checkpointing.
+        layernorm_epsilon: epsilon used in layernorm to avoid
+                           division by zero.
+        init_method_std: standard deviation of the init method which has
+                         the form N(0, std).
+        use_scaled_init_for_output_weights: If True, use 1/sqrt(2*num_layers)
+                                            scaling for the output weights
+                                            (output of self attention and mlp).
+    """
+
+    def __init__(
+        self,
+        num_layers,
+        hidden_size,
+        num_attention_heads,
+        max_sequence_length,
+        max_memory_length,
+        embedding_dropout_prob,
+        attention_dropout_prob,
+        output_dropout_prob,
+        checkpoint_activations,
+        checkpoint_num_layers=1,
+        layernorm_epsilon=1.0e-5,
+        init_method_std=0.02,
+        use_scaled_init_for_output_weights=True,
+        block_position_encoding=True,
+        attention_scale=1.0,
+    ):
+        super(GLMTransformer, self).__init__()
+        self.hidden_size = hidden_size
+        # Store the activation checkpointing flag.
+        self.checkpoint_activations = checkpoint_activations
+        self.checkpoint_num_layers = checkpoint_num_layers
+        self.max_memory_length = max_memory_length
+
+        output_layer_init_method = None
+        if use_scaled_init_for_output_weights:
+            output_layer_init_method = scaled_init_method(
+                init_method_std, num_layers)
+        # Embeddings dropout
+        self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)
+        self.block_position_encoding = block_position_encoding
+
+        # Position embedding (serial).
+        if block_position_encoding:
+            self.position_embeddings = torch.nn.Embedding(
+                max_sequence_length + 1, hidden_size)
+            self.block_position_embeddings = torch.nn.Embedding(
+                max_sequence_length + 1, hidden_size)
+            torch.nn.init.normal_(self.block_position_embeddings.weight,
+                                  mean=0.0,
+                                  std=init_method_std)
+        else:
+            self.position_embeddings = torch.nn.Embedding(
+                max_sequence_length, hidden_size)
+        # Initialize the position embeddings.
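+        # (drawn from N(0, init_method_std), the same scheme as unscaled_init_method)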
+        torch.nn.init.normal_(self.position_embeddings.weight,
+                              mean=0.0,
+                              std=init_method_std)
+
+        def get_layer():
+
+            return GLMTransformerLayer(
+                hidden_size,
+                num_attention_heads,
+                attention_dropout_prob,
+                output_dropout_prob,
+                layernorm_epsilon,
+                unscaled_init_method(init_method_std),
+                output_layer_init_method=output_layer_init_method,
+                relative_encoding=False,
+                performer=False,
+                attention_scale=attention_scale)
+
+        # Transformer layers.
+        self.layers = torch.nn.ModuleList(
+            [get_layer() for _ in range(num_layers)])
+
+        # Final layer norm before output.
+        self.final_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)
+
+        # if deepspeed.checkpointing.is_configured():
+        #     global get_cuda_rng_tracker, checkpoint
+        #     get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker
+        #     checkpoint = deepspeed.checkpointing.checkpoint
+
+    def forward(self,
+                hidden_states,
+                position_ids,
+                attention_mask,
+                memory_states=None,
+                encoder_states=None,
+                return_memory=False,
+                detach_memory=True):
+        batch_size, query_length = hidden_states.size()[:2]
+        memory_length = memory_states[0].size(1) if memory_states else 0
+        key_length = query_length + memory_length
+        # the attention mask is the beginning position of the B region, in [0, query_len)
+        is_scalar = torch.numel(attention_mask) == 1
+        is_sep = is_scalar or torch.numel(attention_mask) == batch_size
+
+        if is_sep:
+            sep = attention_mask.item() if is_scalar else attention_mask
+
+            # conventional transformer
+            def build_mask_matrix(seq_length, sep, memory_length=0):
+                m = hidden_states.new_ones((1, seq_length, seq_length))
+                m = torch.tril(m)
+                if is_scalar:
+                    m[0, :, :sep] = 1
+                else:
+                    m = m.expand(batch_size, -1, -1)
+                    ids = torch.arange(seq_length,
+                                       device=sep.device,
+                                       dtype=sep.dtype).view(1, -1)
+                    mask = ids < sep.view(-1, 1)
+                    m = m.masked_fill(mask.unsqueeze(1).expand_as(m), 1)
+                if memory_length > 0:
+                    m = m.expand(batch_size, -1, -1)
+                    m = torch.cat((hidden_states.new_ones(
+                        (batch_size, seq_length, memory_length)), m),
+                                  dim=2)
+                m = m.unsqueeze(1)
+                return m
+
+            attention_mask = build_mask_matrix(query_length,
+                                               sep,
+                                               memory_length=memory_length)
+        else:
+            attention_mask = attention_mask[:, :, :,
+                                            -query_length - memory_length:]
+
+        if self.block_position_encoding:
+            position_ids, block_position_ids = position_ids[:,
+                                                            0], position_ids[:,
+                                                                             1]
+        position_embeddings = self.position_embeddings(position_ids)
+        hidden_states = hidden_states + position_embeddings
+        if self.block_position_encoding:
+            block_position_embeddings = self.block_position_embeddings(
+                block_position_ids)
+            hidden_states = hidden_states + block_position_embeddings
+        hidden_states = self.embedding_dropout(hidden_states)
+
+        def check_detach(_hidden_states):
+            if detach_memory:
+                return _hidden_states.detach()
+            return _hidden_states
+
+        if self.max_memory_length > 0 or return_memory:
+            mem_layers = [check_detach(hidden_states)]
+        else:
+            mem_layers = []
+
+        def custom(start, end):
+
+            def custom_forward(*inputs):
+                layers_ = self.layers[start:end]
+                x_, inputs = inputs[0], inputs[1:]
+
+                inputs, mems_ = inputs[:1], inputs[1:]
+                for i, layer in enumerate(layers_):
+                    mem_i_ = mems_[i] if mems_ else None
+                    x_ = layer(x_, *inputs, mem=mem_i_)
+                    if self.max_memory_length > 0 or return_memory:
+                        mem_layers.append(check_detach(x_))
+                return x_
+
+            return custom_forward
+
+        if self.checkpoint_activations:
+            l = 0
+            num_layers = len(self.layers)
+            chunk_length = self.checkpoint_num_layers
+            while l < num_layers:
+                args = [hidden_states, attention_mask]
+                if memory_states:
+                    args += memory_states[l:l + chunk_length]
+                hidden_states = checkpoint(custom(l, l + chunk_length), *args)
+                l += chunk_length
+        else:
+            for i, layer in enumerate(self.layers):
+                args = [hidden_states, attention_mask]
+                mem_i = memory_states[i] if memory_states else None
+                hidden_states = layer(*args, mem=mem_i)
+                if self.max_memory_length > 0 or return_memory:
+                    mem_layers.append(check_detach(hidden_states))
+
+        # Final layer norm.
+        output = self.final_layernorm(hidden_states)
+        if self.max_memory_length > 0 or return_memory:
+            mem_layers = self.update_mems(mem_layers,
+                                          memory_states,
+                                          return_memory=return_memory)
+
+        return (output, mem_layers)
+
+    def update_mems(self, hiddens, mems, return_memory=False):
+        memory_length = mems[0].size(1) if mems else 0
+        query_length = hiddens[0].size(1)
+        new_memory_length = memory_length + query_length
+        if not return_memory:
+            new_memory_length = min(self.max_memory_length, new_memory_length)
+        new_mems = []
+        # with torch.no_grad():
+        for i in range(len(hiddens)):
+            if new_memory_length <= query_length:
+                new_mems.append(hiddens[i][:, -new_memory_length:])
+            else:
+                new_mems.append(
+                    torch.cat((mems[i][:, -new_memory_length + query_length:],
+                               hiddens[i]),
+                              dim=1))
+        return new_mems
+
+
+if __name__ == "__main__":
+
+    batch_size = 2
+    seq_len = 512
+    hidden_size = 1024
+    hidden_states = torch.rand([batch_size, seq_len, hidden_size],
+                               dtype=torch.float32).to("cuda")
+    position_ids = torch.ones([batch_size, 2, seq_len],
+                              dtype=torch.int64).to('cuda')
+    attention_mask = torch.tensor([5, 10]).to('cuda')
+
+    model = GLMTransformer(num_layers=24,
+                           hidden_size=1024,
+                           num_attention_heads=16,
+                           max_sequence_length=512,
+                           max_memory_length=0,
+                           embedding_dropout_prob=0.1,
+                           attention_dropout_prob=0.1,
+                           output_dropout_prob=0.1,
+                           checkpoint_activations=True,
+                           checkpoint_num_layers=1,
+                           layernorm_epsilon=1.0e-5,
+                           init_method_std=0.02,
+                           use_scaled_init_for_output_weights=True,
+                           block_position_encoding=True,
+                           attention_scale=1.0).to('cuda')
+
+    outputs = model(hidden_states, position_ids, attention_mask)
+    print(outputs[0].shape)
+    print(outputs[1])

diff --git a/training/dcu/glm-pytorch/extern/layers/transformer_block.py b/training/dcu/glm-pytorch/extern/layers/transformer_block.py
new file mode 100644
index 000000000..c3d33466b
--- /dev/null
+++ b/training/dcu/glm-pytorch/extern/layers/transformer_block.py
@@ -0,0 +1,125 @@
+import torch
+
+from model.layers.attention import SelfAttention
+from .layernorm import LayerNorm
+from model.layers.mlp import GLMMLP
+
+
+class GLMTransformerLayer(torch.nn.Module):
+    """A single transformer layer for GPT-2.
+
+    We use the following notation:
+        h: hidden size
+        n: number of attention heads
+        b: batch size
+        s: sequence length
+    The transformer layer takes input of size [b, s, h] and returns an
+    output of the same size.
+
+    Arguments:
+        hidden_size: The hidden size of the self attention.
+        num_attention_heads: number of attention heads in the self
+                             attention.
+        attention_dropout_prob: dropout probability of the attention
+                                score in self attention.
+        output_dropout_prob: dropout probability for the outputs
+                             after self attention and final output.
+        layernorm_epsilon: epsilon used in layernorm to avoid
+                           division by zero.
+        init_method: initialization method used for the weights. Note
+                     that all biases are initialized to zero and
+                     layernorm weights are initialized to one.
+        output_layer_init_method: initialization for the output layers
+                                  (attention output and mlp output). If
+                                  None, use `init_method`.
+    """
+
+    def __init__(self,
+                 hidden_size,
+                 num_attention_heads,
+                 attention_dropout_prob,
+                 output_dropout_prob,
+                 layernorm_epsilon,
+                 init_method,
+                 output_layer_init_method=None,
+                 relative_encoding=False,
+                 performer=False,
+                 attention_scale=1.0):
+        super(GLMTransformerLayer, self).__init__()
+        # Set output layer initialization if not provided.
+        if output_layer_init_method is None:
+            output_layer_init_method = init_method
+
+        # Layernorm on the input data.
+        self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)
+
+        # Self attention.
+        self.attention = SelfAttention(
+            hidden_size,
+            num_attention_heads,
+            attention_dropout_prob,
+            output_dropout_prob,
+            init_method,
+            output_layer_init_method=output_layer_init_method,
+            relative_encoding=relative_encoding,
+            performer=performer,
+            attention_scale=attention_scale)
+
+        # Layernorm after the self attention.
+        self.post_attention_layernorm = LayerNorm(hidden_size,
+                                                  eps=layernorm_epsilon)
+
+        # MLP
+        self.mlp = GLMMLP(hidden_size,
+                          output_dropout_prob,
+                          init_method,
+                          output_layer_init_method=output_layer_init_method)
+
+    def forward(self,
+                hidden_states,
+                ltor_mask,
+                position_embeddings=None,
+                r_w_bias=None,
+                r_r_bias=None,
+                mem=None):
+        # hidden_states: [b, s, h]
+        # ltor_mask: [1, 1, s, s]
+
+        # Layer norm at the beginning of the transformer layer.
+        layernorm_output = self.input_layernorm(hidden_states)
+        mem = self.input_layernorm(mem) if mem is not None else None
+        # Self attention.
+        attention_output = self.attention(layernorm_output, ltor_mask,
+                                          position_embeddings, r_w_bias,
+                                          r_r_bias, mem)
+        # Residual connection.
+        layernorm_input = hidden_states + attention_output
+        # Layer norm after the self attention.
+        layernorm_output = self.post_attention_layernorm(layernorm_input)
+        # MLP.
+        mlp_output = self.mlp(layernorm_output)
+        # Second residual connection.
+        output = layernorm_input + mlp_output
+
+        return output
+
+
+if __name__ == "__main__":
+    batch_size = 8
+    seq_len = 512
+    hidden_size = 1024
+    num_attention_heads = 16
+    attention_dropout_prob = 0.1
+    output_dropout_prob = 0.1
+    layernorm_epsilon = 1e-10
+    init_method = torch.nn.init.xavier_normal_
+    test_transformer = GLMTransformerLayer(hidden_size, num_attention_heads,
+                                           attention_dropout_prob,
+                                           output_dropout_prob,
+                                           layernorm_epsilon, init_method)
+
+    hidden_states = torch.rand([batch_size, seq_len, hidden_size])
+    ltor_mask = torch.ones([1, 1, seq_len, seq_len])
+
+    outputs = test_transformer(hidden_states, ltor_mask)
+    print(outputs.shape)

diff --git a/training/dcu/glm-pytorch/extern/trainer_adapter.py b/training/dcu/glm-pytorch/extern/trainer_adapter.py
new file mode 100644
index 000000000..d4c3837a5
--- /dev/null
+++ b/training/dcu/glm-pytorch/extern/trainer_adapter.py
@@ -0,0 +1,80 @@
+import torch
+import config
+
+from torch import nn
+
+from .converter import convert_model as _convert_model
+from driver.dist_pytorch import main_proc_print
+from typing import Tuple
+from model.models.modeling import FP16_Module
+from driver.dist_pytorch import PyTorchDistributedDataParallel as TorchDDP
+
+from optimizers.loss_scaler import DynamicLossScaler
+
+clip_grad_norm = torch.nn.utils.clip_grad_norm_
+
+
+def convert_model(model: torch.nn.Module) -> torch.nn.Module:
+    return _convert_model(model, config)
+
+
+def model_to_fp16(model):
+    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision
+    if config.fp16:
+        main_proc_print(" > use fp16...")
+        model.half()
+
+    # GPU allocation.
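+    # (DTK exposes DCUs through the torch.cuda API, so .cuda() places the model on the current DCU)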
+ model.cuda(torch.cuda.current_device()) + + # Fp16 conversion. + if config.fp16: + model = FP16_Module(model) + return model + + +def model_to_ddp(model: nn.Module) -> nn.Module: + i = torch.cuda.current_device() + if torch.distributed.is_available() and torch.distributed.is_initialized(): + model = TorchDDP(model, device_ids=[i], output_device=i) + return model + + +def backward(step, lm_loss, reduced_loss, optimizer, lr_scheduler, model): + args = config + + if not DynamicLossScaler._has_inf_or_nan(reduced_loss): + backward_step(optimizer, model, lm_loss, args) + if step % args.gradient_accumulation_steps == 0: + optimizer.step() + if not (args.fp16 and optimizer.overflow): + lr_scheduler.step() + optimizer.zero_grad() + + else: + main_proc_print("Found NaN loss, skip backward") + return reduced_loss + + +def backward_step(optimizer, model, lm_loss, args): + """Backward step.""" + + # Total loss. + loss = lm_loss + + if args.fp16: + optimizer.backward(loss, update_master_grads=False) + else: + loss.backward() + + if args.fp16: + optimizer.update_master_grads() + + # Clipping gradients helps prevent the exploding gradient. + if args.clip_grad > 0: + if not args.fp16: + clip_grad_norm(model.parameters(), args.clip_grad) + else: + optimizer.clip_master_grads(args.clip_grad) + + return lm_loss diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index 41397ffbc..ae5778abc 100644 --- a/training/run_benchmarks/config/test_conf.py +++ b/training/run_benchmarks/config/test_conf.py @@ -1,7 +1,7 @@ '''Test Configs, including''' # -*-coding:utf-8 -*- -# Set accelerator's vendor name, e.g. iluvatar, cambricon, kunlunxin, ascend, mthreads and metax. +# Set accelerator's vendor name, e.g. iluvatar, cambricon, kunlunxin, ascend, mthreads, metax and dcu. 
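+# e.g., to run the DCU cases listed at the bottom of this file: VENDOR = "dcu"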
# We will run benchmarks in training/ VENDOR = "nvidia" @@ -23,6 +23,8 @@ # " --env MTHREADS_VISIBLE_DEVICES=all" # metax: # " --device=/dev/dri --device=/dev/mxcd --group-add video" +# dcu: +# "-v /opt/hyhal/:/opt/hyhal/ --device=/dev/kfd --device=/dev/dri/ --group-add video" ACCE_CONTAINER_OPT = " --gpus all" # XXX_VISIBLE_DEVICE item name in env # possible value of ACCE_VISIBLE_DEVICE_ENV_NAME are: @@ -31,6 +33,7 @@ # XPU_VISIBLE_DEVICES for kunlunxin # ASCEND_VISIBLE_DEVICES for ascend # MUSA_VISIBLE_DEVICES for mthreads +# HIP_VISIBLE_DEVICES for dcu ACCE_VISIBLE_DEVICE_ENV_NAME = "CUDA_VISIBLE_DEVICES" # Set pip source, which will be used in preparing envs in container @@ -79,14 +82,14 @@ # "wav2vec2:pytorch_1.13:A100:1:8:1": "/raid/dataset/wav2vec2_data/LibriSpeech", # "WaveGlow:pytorch_1.13:A100:1:8:1": "/raid/dataset/LJSpeech/", - "resnet50:tensorflow2:A100:1:8:1": "/raid/dataset/ImageNet2012/tf_records/", + # "resnet50:tensorflow2:A100:1:8:1": "/raid/dataset/ImageNet2012/tf_records/", # "distilbert:pytorch_1.12:A100:1:8:1": "/raid/dataset/distilbert/", # "transformer:pytorch_1.13:A100:1:8:1": "/raid/dataset/transformer/wmt14_en_de_joined_dict", # "swin_transformer:pytorch_1.8:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", # "transformer_xl:pytorch_1.8:A100:1:8:1": "/raid/dataset/transformer_xl/", - "t5_small:pytorch_1.12:A100:1:8:1": "/raid/dataset/t5_small_train", + # "t5_small:pytorch_1.12:A100:1:8:1": "/raid/dataset/t5_small_train", # "gpt2:pytorch_1.12:A100:1:8:1": "/raid/dataset/gpt2", # "bert_hf:pytorch_1.13:A100:1:8:1": "/raid/dataset/bert_hf_train", @@ -160,5 +163,9 @@ # "mobilenetv2:pytorch_2.0:C500:1:8:1": "/raid/dataset/ImageNet_1k_2012/", # "mask_rcnn:pytorch_2.0:C500:1:8:1": "/raid/dataset/coco2017/", # "detr:pytorch_2.0:C500:1:8:1": "/raid/dataset/coco2017/", + + # dcu cases + # "glm:pytorch_1.13:K100:1:8:1": "/home/chenych/datasets/glm_train_datset/", + } From 077321adb62d641b39d879ac37a20706dafc13f6 Mon Sep 17 00:00:00 2001 From: jsnoc <61768944+jsnoc@users.noreply.github.com> Date: Tue, 5 Mar 2024 19:23:40 +0800 Subject: [PATCH 3/4] add resnet infer metax (#474) Co-authored-by: yaguang.wuyaguang --- inference/benchmarks/resnet50/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/inference/benchmarks/resnet50/README.md b/inference/benchmarks/resnet50/README.md index a42cc9948..c689f6a17 100644 --- a/inference/benchmarks/resnet50/README.md +++ b/inference/benchmarks/resnet50/README.md @@ -142,4 +142,5 @@ find ./val -name "*JPEG" | wc -l | kunlunxin_xtcl | fp32 | 128 | / | / | / | / | / | / | 76.2/76.2 | 4.52/32.0 | | kunlunxin_xtcl | fp16 | 256 | / | / | / | / | / | / | 76.2/76.2 | 4.52/32.0 | | zixiao | fp16 | 32*6 | 261.103 | / | / | 193.151 | 6342.191 | / | 76.2/76.2 | / | - +| metax-nocompiler | fp16 | 256 |/ | / | / | / | / | 7.8% | 76.2/76.2 | 3.83/64.0 | +| metax-nocompiler | fp32 | 256 | / | / | / | / | / | 7.7% | 76.2/76.2 | 5.46/64.0 | From a67831d4ca9a5ac2fa8df3bb8fe0e320dde9fa2d Mon Sep 17 00:00:00 2001 From: Kathrine Date: Wed, 6 Mar 2024 13:49:02 +0800 Subject: [PATCH 4/4] [metax] add bert_large inference result (#476) * add bert_hf result * Update README.md 1 * add glm result * [metax] Update glm README.md * update metax bertlarge inference result * update metax bert_large inference result * Update README.md --- inference/benchmarks/bertLarge/README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/inference/benchmarks/bertLarge/README.md b/inference/benchmarks/bertLarge/README.md index 
d7eeb4aeb..e627f7a7b 100644
--- a/inference/benchmarks/bertLarge/README.md
+++ b/inference/benchmarks/bertLarge/README.md
@@ -77,6 +77,19 @@ bert_reference_results_text_md5.txt
 
 - IXRT: ixrt-0.8.0+corex.3.2.1
 
+#### 2.4 MetaX C500
+
+- ##### Hardware environment
+  - Machine and accelerator model: 曦云®C500 64G
+- ##### Software environment
+  - OS version: Ubuntu 20.04.6
+  - OS kernel version: 5.4.0-26-generic
+  - Accelerator driver version: 2.2.0
+  - Docker version: 24.0.7
+  - Inference framework version: pytorch-2.0.0+mc2.18.0.8-cp38-cp38-linux_x86_64.whl
+
+
+
 ### 4. Results (BERT-Large)
 
 * Metric list
@@ -103,4 +116,6 @@
 | tensorrt | fp32 | 32 | 1868.8 | 150.4 | 152.2 | 190.4 | 194.1 | 42.0% | 0.638/0.638 | 16.9/40.0 |
 | kunlunxin_xtcl| W32A16 | 32 |/ | / | / | / | / | / | 0.638/0.638| /|
 | iluvatar_ixrt| fp16 | 32 |/ | / | / | / | / | / | 0.599/0.638| /|
+| metax-nocompiler| fp16 | 32 |/ | / | / | / | / | 27.6% | 0.638/0.638| 4.3/64.0|
+| metax-nocompiler| fp32 | 32 |/ | / | / | / | / | 28.1% | 0.639/0.638| 6.1/64.0|
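For readers reproducing these vendor results, the per-device `VRAM%`/`DCU%` columns that the DCU monitor from PATCH 2 appends to `dcu_monitor.log` can be summarized offline. The sketch below is illustrative only: `summarize_dcu_log`, the regex, and the default log path are not part of FlagPerf; the only assumption taken from this series is the rocm-smi table layout shown in training/dcu/README.md.

```
import re

# One rocm-smi device row, per the sample in training/dcu/README.md:
#   "0    53.0C   96.0W   auto  300.0W    0%   0%   Normal"
DEVICE_ROW = re.compile(
    r"^\s*\d+\s+[\d.]+C\s+[\d.]+W\s+\S+\s+[\d.]+W\s+(\d+)%\s+(\d+)%")


def summarize_dcu_log(path="./logs/dcu_monitor.log"):
    """Print the sample count and peak VRAM%/DCU% across all snapshots."""
    vram, util = [], []
    with open(path) as f:
        for line in f:
            m = DEVICE_ROW.match(line)
            if m:
                vram.append(int(m.group(1)))  # VRAM% column
                util.append(int(m.group(2)))  # DCU% column
    if not util:
        print("no device rows found in", path)
        return
    print("samples=%d  peak VRAM%%=%d  peak DCU%%=%d"
          % (len(util), max(vram), max(util)))


if __name__ == "__main__":
    summarize_dcu_log()
```

Per the monitor's own usage string, the log is produced by `python3 training/dcu/dcu_monitor.py -o start -l ./logs/`; run the summary after stopping the daemon with `-o stop`.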