diff --git a/inference/benchmarks/stable_diffusion_v1_4/README.md b/inference/benchmarks/stable_diffusion_v1_4/README.md
index 0df0c5920..c73c77b4a 100644
--- a/inference/benchmarks/stable_diffusion_v1_4/README.md
+++ b/inference/benchmarks/stable_diffusion_v1_4/README.md
@@ -59,3 +59,5 @@
 | kunlunxin_xtcl | fp32 | 2 | / | / | / | / | / | / | 26.524/25.3 | 0.07/32.0 |
 | null | fp16 | 16 | / | 11.7 | 60.7 | / | / | 13.2% | -/25.2 | 5.7/40.0 |
 | null | fp32 | 8 | / | 9.3 | 27.3 | / | / | 11.9% | -/25.3 | 6.3/40.0 |
+| metax-nocompiler | fp16 | 64 | / | / | / | / | / | 12.7% | -/25.4 | 14.7/64.0 |
+| metax-nocompiler | fp32 | 16 | / | / | / | / | / | 10.3% | -/25.4 | 55.57/64.0 |
diff --git a/inference/configs/host.yaml b/inference/configs/host.yaml
index 7ad020c8e..191867429 100644
--- a/inference/configs/host.yaml
+++ b/inference/configs/host.yaml
@@ -8,10 +8,13 @@ SSH_PORT: "22"
 HOSTS_PORTS: ["2222"]
 MASTER_PORT: "29501"
 SHM_SIZE: "32G"
+# metax:
+# " --device=/dev/dri --device=/dev/mxcd --group-add video"
 ACCE_CONTAINER_OPT: " --gpus all"
 PIP_SOURCE: "https://mirror.baidu.com/pypi/simple"
 CLEAR_CACHES: True
 ACCE_VISIBLE_DEVICE_ENV_NAME: "CUDA_VISIBLE_DEVICES"
 CASES:
     # "resnet50:pytorch_1.13": "/raid/dataset/ImageNet/imagenet/val"
-    "vit_l_16:pytorch_2.1": "/raid/dataset/ImageNet_1k_2012/val"
\ No newline at end of file
+    "vit_l_16:pytorch_2.1": "/raid/dataset/ImageNet_1k_2012/val"
+    #"stable_diffusion_v1_4:pytorch_2.0": "/raid/dataset/stable_diffusion_v1_4/"
\ No newline at end of file
diff --git a/inference/docker_images/metax/metax_analysis.py b/inference/docker_images/metax/metax_analysis.py
new file mode 100644
index 000000000..ce863b57b
--- /dev/null
+++ b/inference/docker_images/metax/metax_analysis.py
@@ -0,0 +1,14 @@
+def analysis_log(logpath):
+    logfile = open(logpath)
+
+    max_usage = 0.0
+    max_mem = 0.0
+    for line in logfile.readlines():
+        if "MiB" in line:
+            # memory samples are logged as "used/total" in MiB
+            usage_and_maxusage = line.split(" ")[2]
+            usage = float(usage_and_maxusage.split("/")[0])
+            max_usage = max(max_usage, usage)
+            max_mem = float(usage_and_maxusage.split("/")[1])
+    logfile.close()
+    return round(max_usage / 1024.0, 2), round(max_mem / 1024.0, 2), 120e12, 240e12
\ No newline at end of file
diff --git a/inference/docker_images/metax/metax_monitor.py b/inference/docker_images/metax/metax_monitor.py
new file mode 100644
index 000000000..20c69e9cf
--- /dev/null
+++ b/inference/docker_images/metax/metax_monitor.py
@@ -0,0 +1,256 @@
+# !/usr/bin/env python3
+# encoding: utf-8
+'''
+Usage:  python3 metax_monitor.py -o operation -l [log_path]
+            -o, --operation     start|stop|restart|status
+            -l, --log           log path , ./logs/ default
+'''
+
+import os
+import sys
+import time
+import signal
+import atexit
+import argparse
+import datetime
+from multiprocessing import Process
+import subprocess
+import schedule
+
+
+class Daemon:
+    '''
+    daemon subprocess class.
+    usage: subclass this daemon and override the run() method.
+    sys-monitor.pid: in the /tmp/, auto del when unexpected exit.
+    verbose: debug mode, disabled default.
+ ''' + + def __init__(self, + pid_file, + log_file, + err_file, + gpu_log, + log_path, + rate=5, + stdin=os.devnull, + stdout=os.devnull, + stderr=os.devnull, + home_dir='.', + umask=0o22, + verbose=0): + self.stdin = stdin + self.stdout = stdout + self.stderr = stderr + self.home_dir = home_dir + self.verbose = verbose + self.pidfile = pid_file + self.logfile = log_file + self.errfile = err_file + self.gpufile = gpu_log + self.logpath = log_path + self.rate = rate + self.umask = umask + self.verbose = verbose + self.daemon_alive = True + + def get_pid(self): + try: + with open(self.pidfile, 'r') as pf: + pid = int(pf.read().strip()) + except IOError: + pid = None + except SystemExit: + pid = None + return pid + + def del_pid(self): + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + + def run(self): + ''' + NOTE: override the method in subclass + ''' + + def gpu_mon(file): + TIMESTAMP = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S') + cmd = "mx-smi |grep 'W' -m 1 | awk '{print $2, $3, $5,$6}' && mx-smi |grep 'MXC' -m 1 | awk '{print $7}'" + process = subprocess.Popen(cmd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + encoding='utf-8') + try: + out = process.communicate(timeout=10) + except subprocess.TimeoutExpired: + process.kill() + out = process.communicate() + + if process.returncode != 0: + result = "error" + result = TIMESTAMP + "\n" + out[0] + "\n" + with open(file, 'a') as f: + f.write(result) + + def timer_gpu_mon(): + gpu_process = Process(target=gpu_mon, args=(self.gpufile, )) + gpu_process.start() + + schedule.every(self.rate).seconds.do(timer_gpu_mon) + while True: + schedule.run_pending() + time.sleep(5) + + def daemonize(self): + if self.verbose >= 1: + print('daemon process starting ...') + try: + pid = os.fork() + if pid > 0: + sys.exit(0) + except OSError as e: + sys.stderr.write('fork #1 failed: %d (%s)\n' % + (e.errno, e.strerror)) + sys.exit(1) + os.chdir(self.home_dir) + os.setsid() + os.umask(self.umask) + try: + pid = os.fork() + if pid > 0: + sys.exit(0) + except OSError as e: + sys.stderr.write('fork #2 failed: %d (%s)\n' % + (e.errno, e.strerror)) + sys.exit(1) + sys.stdout.flush() + sys.stderr.flush() + si = open(self.stdin, 'r') + so = open(self.stdout, 'a+') + if self.stderr: + se = open(self.stderr, 'a+') + else: + se = so + os.dup2(si.fileno(), sys.stdin.fileno()) + os.dup2(so.fileno(), sys.stdout.fileno()) + os.dup2(se.fileno(), sys.stderr.fileno()) + atexit.register(self.del_pid) + pid = str(os.getpid()) + with open(self.pidfile, 'w+') as f: + f.write('%s\n' % pid) + + def start(self): + if not os.path.exists(self.logpath): + os.makedirs(self.logpath) + elif os.path.exists(self.gpufile): + os.remove(self.gpufile) + if self.verbose >= 1: + print('ready to start ......') + # check for a pid file to see if the daemon already runs + pid = self.get_pid() + if pid: + msg = 'pid file %s already exists, is it already running?\n' + sys.stderr.write(msg % self.pidfile) + sys.exit(1) + # start the daemon + self.daemonize() + self.run() + + def stop(self): + if self.verbose >= 1: + print('stopping ...') + pid = self.get_pid() + if not pid: + msg = 'pid file [%s] does not exist. 
Not running?\n' % self.pidfile + sys.stderr.write(msg) + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + return + # try to kill the daemon process + try: + i = 0 + while 1: + os.kill(pid, signal.SIGTERM) + time.sleep(1) + i = i + 1 + if i % 10 == 0: + os.kill(pid, signal.SIGHUP) + except OSError as err: + err = str(err) + if err.find('No such process') > 0: + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + else: + print(str(err)) + sys.exit(1) + if self.verbose >= 1: + print('Stopped!') + + def restart(self): + self.stop() + self.start() + + def status(self): + pid = self.get_pid() + if pid: + if os.path.exists('/proc/%d' % pid): + return pid + return False + + +def parse_args(): + ''' Check script input parameter. ''' + parse = argparse.ArgumentParser(description='Sys monitor script') + parse.add_argument('-o', + type=str, + metavar='[operation]', + required=True, + help='start|stop|restart|status') + parse.add_argument('-l', + type=str, + metavar='[log_path]', + required=False, + default='./logs/', + help='log path') + args = parse.parse_args() + return args + + +def main(): + sample_rate1 = 5 + args = parse_args() + operation = args.o + log_path = args.l + pid_fn = str('/tmp/gpu_monitor.pid') + log_fn = str(log_path + '/metax_monitor.log') + err_fn = str(log_path + '/metax_monitor.err') + # result for gpu + gpu_fn = str(log_path + '/metax_monitor.log') + + subdaemon = Daemon(pid_fn, + log_fn, + err_fn, + gpu_fn, + log_path, + verbose=1, + rate=sample_rate1) + if operation == 'start': + subdaemon.start() + elif operation == 'stop': + subdaemon.stop() + elif operation == 'restart': + subdaemon.restart() + elif operation == 'status': + pid = subdaemon.status() + if pid: + print('process [%s] is running ......' % pid) + else: + print('daemon process [%s] stopped' % pid) + else: + print("invalid argument!") + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/inference/docker_images/metax/pytorch_2.0/Dockerfile b/inference/docker_images/metax/pytorch_2.0/Dockerfile new file mode 100644 index 000000000..0a813966e --- /dev/null +++ b/inference/docker_images/metax/pytorch_2.0/Dockerfile @@ -0,0 +1,11 @@ +FROM mxcr.io/library/maca-c500-pytorch:2.19.2.5-ubuntu18.04-amd64 +ENV PATH="/opt/conda/bin:${PATH}" +ENV PYTORCH_USE_FLASHATTN=1 +RUN pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple +RUN pip install numpy +RUN pip install pyyaml +RUN pip install timm +RUN pip install munch +RUN pip install schedule +RUN pip install loguru +RUN /bin/bash -c "uname -a" \ No newline at end of file diff --git a/training/benchmarks/llama2_70B/megatron/megatron_main.sh b/training/benchmarks/llama2_70B/megatron/megatron_main.sh index 812d57ae4..390588bc2 100644 --- a/training/benchmarks/llama2_70B/megatron/megatron_main.sh +++ b/training/benchmarks/llama2_70B/megatron/megatron_main.sh @@ -131,8 +131,10 @@ LOGGING_ARGS=" --log-interval 1 " +CODE_PATH="/workspace/FlagScale/pretrain_llama.py" + source $VENDOR_SHELL -cmd="torchrun $DISTRIBUTED_ARGS /workspace/FlagScale/pretrain_llama.py \ +cmd="torchrun $DISTRIBUTED_ARGS $CODE_PATH \ $TRAINING_ARGS \ $MIXED_PRECISION_ARGS \ $DATA_ARGS \ diff --git a/training/kunlunxin/docker_image/megatron/Dockerfile b/training/kunlunxin/docker_image/megatron/Dockerfile new file mode 100644 index 000000000..d02bf6b01 --- /dev/null +++ b/training/kunlunxin/docker_image/megatron/Dockerfile @@ -0,0 +1,6 @@ +FROM iregistry.baidu-int.com/xmlir/xmlir_ubuntu_2004_x86_64:v0.27 +RUN /bin/bash -c "pip config set global.index-url 
https://mirror.baidu.com/pypi/simple"
+RUN /bin/bash -c "uname -a"
+RUN /bin/bash -c "alias python3=python"
+
+ENV PATH /root/miniconda/envs/python38_torch201_cuda/bin:$PATH
diff --git a/training/kunlunxin/docker_image/megatron/megatron_install.sh b/training/kunlunxin/docker_image/megatron/megatron_install.sh
new file mode 100644
index 000000000..e79533ee3
--- /dev/null
+++ b/training/kunlunxin/docker_image/megatron/megatron_install.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# using github mirrors to avoid github TTL
+#export https_proxy=http://10.1.0.34:7890
+git clone https://githubfast.com/FlagOpen/FlagScale
+cd FlagScale
+
+git checkout eb0438a5459404e2e4c70b15fa37e9a197ab159d
+echo 'export PYTHONPATH=$PYTHONPATH:/home/FlagScale' >> /root/.bashrc
+source /root/.bashrc
+
+wget https://bd.bcebos.com/v1/klx-pytorch-work-bd/training/zhangling21_llama70B/xmlir201_5.run
+bash xmlir201_5.run
+XFLAGS --enable transformer_engine
+XFLAGS --enable flagscale
\ No newline at end of file
diff --git a/training/kunlunxin/llama2_70B-megatron/README.md b/training/kunlunxin/llama2_70B-megatron/README.md
new file mode 100644
index 000000000..ec69c5245
--- /dev/null
+++ b/training/kunlunxin/llama2_70B-megatron/README.md
@@ -0,0 +1,49 @@
+### 昆仑芯XPU配置与运行信息参考
+#### 环境配置
+- ##### 硬件环境
+  - 机器型号: 昆仑芯AI加速器组R480-X8
+  - 加速卡型号: 昆仑芯AI加速卡R300
+  - 多机网络类型、带宽: InfiniBand,200Gb/s
+
+- ##### 软件环境
+  - OS版本:Ubuntu 20.04
+  - OS kernel版本: 5.4.0-26-generic
+  - 加速卡驱动版本:4.0.25
+  - Docker镜像和版本:iregistry.baidu-int.com/xmlir/xmlir_ubuntu_2004_x86_64:v0.27
+  - 训练框架版本:xmlir
+  - 训练编译器版本:xacc
+  - 依赖软件版本:pytorch-2.0.1
+
+
+### 运行情况
+
+* 输入批尺寸
+  1. local_batchsize(micro_batchsize),简写为LBS,即实际进入模型的张量批尺寸,为config_R300x10x8.py中所写,在本case中默认为1
+  2. seqlength(max_position_embedding),简写为MPE,即实际进入模型的序列长度,为config_R300x10x8.py中所写,在本case中默认为4096
+  3. gradient_accumulate_steps,简写为GAS,即梯度累加步数,为config_R300x10x8.py中所写,在本case中默认为44
+  4.
global_batchsize恒等于local_batchsize\*gradient_accumulate_steps\*data_parallel_size。在本case中,data_parallel_size=world_size/TPsize/PPsize。 + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| ------------ | -------------------------- | ---------------------------------- | +| 任务类别 | 自然语言理解 | | +| 模型 | llama2_70b | | +| 数据集 | pile wikipedia | | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16/bf16 | +| 超参修改 | parallel,见“性能指标” | 格式为TPxPPyDPz,例如TP2PP1DP4 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | nvidia H800 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 计算使用率 | MFU,见“性能指标” | 参见PaLM论文定义 | +| **吞吐量** | **token/p/s,见“性能指标”** | 平均单卡每秒处理的token数 | + +* 性能指标 + +值得注意的是,下列第4组实验的global_batchsize与llama2原始论文相同, 训练100 step,此项实验也将作为精度对齐所用实验。 + +| 配置 | precision | parallel | fix_hp | token/p/s | 是否精度对齐 | mem | MFU | +| ------------------- | --------- | --------- | ---------------------------- | --------- | ----- | ----- | --- | +| R300十机80卡(10x8) | fp32 | TP8PP10DP1 | / | / | / | 21/32 | / | +| R300十机80卡(10x8) | amp | TP8PP10DP1 | GAS=1024(GBS=1024=4M tokens) | / | doing* | 21/32 | / | +因缺少R300机器,在单卡R300与单卡GPU上初步验证精度。目前已通过减小模型层数的方式,在单卡R300与单卡GPU上验证精度。完整70B模型的精度验证进行中。 diff --git a/training/kunlunxin/llama2_70B-megatron/config/config_R300x10x8.py b/training/kunlunxin/llama2_70B-megatron/config/config_R300x10x8.py new file mode 100644 index 000000000..c56990d52 --- /dev/null +++ b/training/kunlunxin/llama2_70B-megatron/config/config_R300x10x8.py @@ -0,0 +1,10 @@ +seqlength = 4096 +batchsize = 1 +accumulate_steps = 44 +train_tokens = 100000000 +theoryflops = 256000000000000.0 +epochs = 1 +flashattn = False +recompute = False +tensor_parallel = 8 +pipeline_parallel = 10 diff --git a/training/kunlunxin/llama2_70B-megatron/config/environment_variables.sh b/training/kunlunxin/llama2_70B-megatron/config/environment_variables.sh new file mode 100644 index 000000000..90a1a4e5e --- /dev/null +++ b/training/kunlunxin/llama2_70B-megatron/config/environment_variables.sh @@ -0,0 +1 @@ +export PATH=/root/miniconda/envs/python38_torch201_cuda/bin:$PATH \ No newline at end of file diff --git a/training/kunlunxin/llama2_70B-megatron/config/requirements.txt b/training/kunlunxin/llama2_70B-megatron/config/requirements.txt new file mode 100644 index 000000000..ad213956e --- /dev/null +++ b/training/kunlunxin/llama2_70B-megatron/config/requirements.txt @@ -0,0 +1 @@ +sentencepiece \ No newline at end of file diff --git a/training/kunlunxin/llama2_70B-megatron/config/training_adapter.sh b/training/kunlunxin/llama2_70B-megatron/config/training_adapter.sh new file mode 100644 index 000000000..fcc2210d5 --- /dev/null +++ b/training/kunlunxin/llama2_70B-megatron/config/training_adapter.sh @@ -0,0 +1,61 @@ +export PYTHONPATH=$PYTHONPATH:/home/FlagScale + +MIXED_PRECISION_ARGS="" + +CODE_PATH="/home/FlagScale/pretrain_llama.py" + +TRAINING_ARGS=" + --train-samples $TRAIN_SAMPLES \ + --eval-iters 0 \ + --tensor-model-parallel-size $TP \ + --pipeline-model-parallel-size $PP \ + --micro-batch-size $M_BATCHSIZE \ + --global-batch-size $G_BATCHSIZE \ + --disable-bias-linear \ + --optimizer adam \ + --no-gradient-accumulation-fusion \ + --recompute-granularity 'full' \ + --recompute-num-layers 1 \ + --recompute-method 'uniform' \ + --no-async-tensor-model-parallel-allreduce \ + --distribute-saved-activations +" +NETWORK_ARGS=" + --num-layers 80 \ + --hidden-size 8192 \ + --num-attention-heads 64 \ + --ffn-hidden-size 28672 \ + --seq-length $SEQLENGTH \ + --max-position-embeddings $SEQLENGTH \ + --normalization RMSNorm \ + 
--group-query-attention \ + --num-query-groups 8 \ + --use-rotary-position-embeddings \ + --no-position-embedding \ + --swiglu \ + --multiple-of 4096 \ + --untie-embeddings-and-output-weights +" + + +export BKCL_CCIX_BUFFER_GM=1 +export BKCL_CCIX_RING=1 +export BKCL_TREE_THRESHOLD=1 + +export BKCL_SOCKET_IFNAME=ibs11 +export BKCL_USE_RDMA=0 + +export BKCL_RDMA_FORCE_TREE=1 +export BKCL_ENABLE_XDR=0 +export BKCL_RING_BUFFER_SIZE=1024000 +export BKCL_RDMA_NICS=ibs11 +export BKCL_FORCE_ALLREDUCE_IN_MULTINODE=1 +worker_num=0 + +ulimit -c 0 +export XMLIR_F_XPU_ENABLED_BOOL=true +export ALLREDUCE_ASYNC=false +export ALLGATHER_ASYNC=false +export ALLREDUCE_FUSION=0 +export BKCL_TIMEOUT=1800 +export BKCL_FORCE_SYNC=1 \ No newline at end of file diff --git a/training/metax/bert_hf-pytorch/README.md b/training/metax/bert_hf-pytorch/README.md index 931958679..0b23692f7 100644 --- a/training/metax/bert_hf-pytorch/README.md +++ b/training/metax/bert_hf-pytorch/README.md @@ -1,4 +1,4 @@ -### Nvidia GPU配置与运行信息参考 +### 沐曦集成电路 C500 GPU配置与运行信息参考 #### 环境配置 - ##### 硬件环境 - 机器、加速卡型号: 曦云®C500 64G diff --git a/training/metax/detr-pytorch/README.md b/training/metax/detr-pytorch/README.md new file mode 100644 index 000000000..a16e7260d --- /dev/null +++ b/training/metax/detr-pytorch/README.md @@ -0,0 +1,45 @@ +### 测试数据集下载 +参见[测试数据集下载](../../benchmarks/detr/README.md#测试数据集下载地址) + +### 沐曦集成电路 C500 GPU配置与运行信息参考 +#### 环境配置 +- ##### 硬件环境 + - 机器、加速卡型号: 曦云®C500 64G + - 多机网络类型、带宽: InfiniBand,2x200 Gb/s + +- ##### 软件环境 + - OS版本:Ubuntu 20.04.6 + - OS kernel版本: 5.4.0-26-generic + - 加速卡驱动版本:2.2.0 + - Docker 版本:24.0.7 + - 训练框架版本:pytorch-2.0.0+mc2.19.0.6-cp38-cp38-linux_x86_64.whl + - 依赖软件版本:无 + +#### 运行情况 + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| -------------- | ------------------------------ | ------------------------------------------- | +| 任务类别 | 目标检测、全景分割 | | +| 模型 | detr | | +| 数据集 | coco2017 | | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | nvidia A100 | | +| 硬件存储使用 | mem(actual/total),见“性能指标” | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | +| 总吞吐量 | p_whole,见“性能指标” | 实际训练样本数除以总时间(performance_whole) | +| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | +| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1) | +| 训练结果 | mAP,见“性能指标” | mean Average Precision | +| 额外修改项 | 无 | | + + +* 性能指标 + +| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | mAP | mem | +| ------------------- | --------- | -------------- | -------- | ------- | ------- | ------ | ----- | --------- | +| MXC500 单机8卡(1x8) | fp32 | bs=8,lr=0.0001 | | | | |39.6%| 57.2/64.0 | +| MXC500 单机单卡(1x1)| fp32 | / | | | | | | 60.7/64.0 | +| MXC500 两机16卡(2x8) | fp32 | / | | | | | | 46.3/64.0 | diff --git a/training/metax/detr-pytorch/config/config_C500x1x1.py b/training/metax/detr-pytorch/config/config_C500x1x1.py new file mode 100644 index 000000000..f09c8b6c3 --- /dev/null +++ b/training/metax/detr-pytorch/config/config_C500x1x1.py @@ -0,0 +1,4 @@ +from config_common import * + +train_batch_size = 8 +eval_batch_size = 8 \ No newline at end of file diff --git a/training/metax/detr-pytorch/config/config_C500x1x8.py b/training/metax/detr-pytorch/config/config_C500x1x8.py new file mode 100644 index 000000000..f09c8b6c3 --- /dev/null +++ b/training/metax/detr-pytorch/config/config_C500x1x8.py @@ -0,0 +1,4 @@ +from config_common import * + +train_batch_size = 8 +eval_batch_size = 8 \ No newline at end of file diff --git a/training/metax/detr-pytorch/config/config_C500x2x8.py 
b/training/metax/detr-pytorch/config/config_C500x2x8.py new file mode 100644 index 000000000..f09c8b6c3 --- /dev/null +++ b/training/metax/detr-pytorch/config/config_C500x2x8.py @@ -0,0 +1,4 @@ +from config_common import * + +train_batch_size = 8 +eval_batch_size = 8 \ No newline at end of file diff --git a/training/metax/detr-pytorch/config/config_common.py b/training/metax/detr-pytorch/config/config_common.py new file mode 100644 index 000000000..851b29d4e --- /dev/null +++ b/training/metax/detr-pytorch/config/config_common.py @@ -0,0 +1,2 @@ +vendor = "metax" +dist_backend = "nccl" \ No newline at end of file diff --git a/training/metax/detr-pytorch/config/environment_variables.sh b/training/metax/detr-pytorch/config/environment_variables.sh new file mode 100644 index 000000000..a7f429ac2 --- /dev/null +++ b/training/metax/detr-pytorch/config/environment_variables.sh @@ -0,0 +1,5 @@ +# ================================================= +# Export variables +# ================================================= + +export METAX_USE_TF32=1 diff --git a/training/metax/detr-pytorch/config/requirements.txt b/training/metax/detr-pytorch/config/requirements.txt new file mode 100644 index 000000000..061205713 --- /dev/null +++ b/training/metax/detr-pytorch/config/requirements.txt @@ -0,0 +1,6 @@ +cython +git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI&egg=pycocotools +submitit +scipy +onnx +onnxruntime \ No newline at end of file diff --git a/training/metax/detr-pytorch/extern/.gitkeep b/training/metax/detr-pytorch/extern/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/training/metax/glm-pytorch/README.md b/training/metax/glm-pytorch/README.md new file mode 100644 index 000000000..0eaff257b --- /dev/null +++ b/training/metax/glm-pytorch/README.md @@ -0,0 +1,46 @@ +### 模型Checkpoint下载 +[模型Checkpoint下载](../../benchmarks/glm/README.md#模型checkpoint) +### 测试数据集下载 +[测试数据集下载](../../benchmarks/glm/README.md#数据集) + +### 沐曦集成电路 C500 GPU配置与运行信息参考 +#### 环境配置 +- ##### 硬件环境 + - 机器、加速卡型号: 曦云®C500 64G + - 多机网络类型、带宽: InfiniBand,2x200 Gb/s +- ##### 软件环境 + - OS版本:Ubuntu 20.04.6 + - OS kernel版本: 5.4.0-26-generic + - 加速卡驱动版本:2.2.0 + - Docker 版本:24.0.7 + - 训练框架版本:pytorch-2.0.0+mc2.18.0.8-cp38-cp38-linux_x86_64.whl + - 依赖软件版本:无 + + +### 运行情况 +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| ---------------- | -------------------------------------------- | ------------------------------------------- | +| 任务类别 | 自然语言理解、无条件文本生成、有条件文本生成 | | +| 模型 | GLM | | +| 数据集 | superglue | | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | MXC500 | | +| 硬件存储使用 | mem(actual/total),见“性能指标” | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | +| 总吞吐量 | p_whole,见“性能指标” | 实际训练样本数除以总时间(performance_whole) | +| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | +| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1) | +| **计算卡使用率** | **\*MFU** | model flops utilization | +| 训练结果 | acc,见“性能指标” | 准确率 | +| 额外修改项 | 无 | | + +* 性能指标 + +| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem | MFU | +| ------------------- | --------- | --------------- | -------- | ------- | ------- | ------ | ----- | --------- | ----- | +| C500单机8卡(1x8) | fp32 | / | | | | | 0.802 | 54.5/64.0 | | +| C500单机单卡(1x1) | fp32 | / | | | | | / | 50.4/64.0 | | +| C500两机16卡(2x8) | fp32 | / | | | | | / | 29.8/64.0 | | diff --git a/training/metax/glm-pytorch/config/config_C500x1x1.py b/training/metax/glm-pytorch/config/config_C500x1x1.py new file mode 100644 
index 000000000..3c1125e57 --- /dev/null +++ b/training/metax/glm-pytorch/config/config_C500x1x1.py @@ -0,0 +1,19 @@ +train_batch_size = 16 +eval_batch_size = 16 + +max_samples_termination = 24135 + +dist_backend = "nccl" + +lr = 1e-5 +weight_decay = 0.1 +adam_beta1 = 0.9 +adam_beta2 = 0.999 +adam_eps = 1e-08 +gradient_accumulation_steps = 1 +warmup = 0.1 +lr_decay_ratio = 0.1 +lr_decay_iters = 4338 +log_freq = 1 + +training_event = None diff --git a/training/metax/glm-pytorch/config/config_C500x1x8.py b/training/metax/glm-pytorch/config/config_C500x1x8.py new file mode 100644 index 000000000..3cc3e2723 --- /dev/null +++ b/training/metax/glm-pytorch/config/config_C500x1x8.py @@ -0,0 +1,18 @@ +train_batch_size = 16 +eval_batch_size = 16 + +dist_backend = "nccl" + +lr = 1e-5 +weight_decay = 0.1 +adam_beta1 = 0.9 +adam_beta2 = 0.999 +adam_eps = 1e-08 +gradient_accumulation_steps = 1 +warmup = 0.1 +lr_decay_ratio = 0.1 +lr_decay_iters = 4338 +log_freq = 1 +seed = 10483 +max_samples_termination = 5553080 +training_event = None diff --git a/training/metax/glm-pytorch/config/config_C500x2x8.py b/training/metax/glm-pytorch/config/config_C500x2x8.py new file mode 100644 index 000000000..a40988fed --- /dev/null +++ b/training/metax/glm-pytorch/config/config_C500x2x8.py @@ -0,0 +1,22 @@ +fp16 = True +ddp_type = "apex" +train_batch_size = 8 +eval_batch_size = 8 + +dist_backend = "nccl" + +lr = 1e-5 +weight_decay = 0.1 +adam_beta1 = 0.9 +adam_beta2 = 0.999 +adam_eps = 1e-08 +gradient_accumulation_steps = 1 +warmup = 0.1 +lr_decay_ratio = 0.1 +lr_decay_iters = 4338 +log_freq = 1 + +training_event = None + +max_samples_termination = 1388270 * 4 +target_accuracy = 0.8 diff --git a/training/metax/glm-pytorch/config/requirements.txt b/training/metax/glm-pytorch/config/requirements.txt new file mode 100644 index 000000000..3adfcca6c --- /dev/null +++ b/training/metax/glm-pytorch/config/requirements.txt @@ -0,0 +1,3 @@ +h5sparse +boto3 +h5py diff --git a/training/metax/glm-pytorch/extern/converter.py b/training/metax/glm-pytorch/extern/converter.py new file mode 100644 index 000000000..330bee79a --- /dev/null +++ b/training/metax/glm-pytorch/extern/converter.py @@ -0,0 +1,21 @@ +from driver import dist_pytorch +from .layers.transformer import GLMTransformer + + +def convert_model(model, config): + if dist_pytorch.get_rank() == 0: + print("use apex layer norm", flush=True) + state_dict = model.state_dict() + transformer_layer = GLMTransformer( + num_layers=config.num_layers, + hidden_size=config.hidden_size, + num_attention_heads=config.num_attention_heads, + max_sequence_length=config.max_seq_length, + max_memory_length=config.max_memory_length, + embedding_dropout_prob=config.hidden_dropout, + attention_dropout_prob=config.attention_dropout, + output_dropout_prob=config.hidden_dropout, + checkpoint_activations=config.checkpoint_activations) + model.model.transformer = transformer_layer + model.load_state_dict(state_dict, strict=True) + return model diff --git a/training/metax/glm-pytorch/extern/layers/__init__.py b/training/metax/glm-pytorch/extern/layers/__init__.py new file mode 100644 index 000000000..dab9da7e4 --- /dev/null +++ b/training/metax/glm-pytorch/extern/layers/__init__.py @@ -0,0 +1 @@ +from .transformer import * diff --git a/training/metax/glm-pytorch/extern/layers/layernorm.py b/training/metax/glm-pytorch/extern/layers/layernorm.py new file mode 100644 index 000000000..96a935cc6 --- /dev/null +++ b/training/metax/glm-pytorch/extern/layers/layernorm.py @@ -0,0 +1 @@ +from apex.normalization 
import FusedLayerNorm as LayerNorm diff --git a/training/metax/glm-pytorch/extern/layers/transformer.py b/training/metax/glm-pytorch/extern/layers/transformer.py new file mode 100644 index 000000000..82e98ed26 --- /dev/null +++ b/training/metax/glm-pytorch/extern/layers/transformer.py @@ -0,0 +1,298 @@ +import torch +import math + +from .transformer_block import GLMTransformerLayer +from .layernorm import LayerNorm +from model.models.checkpoint import checkpoint + + +def scaled_init_method(sigma, num_layers): + """Init method based on N(0, sigma/sqrt(2*num_layers).""" + std = sigma / math.sqrt(2.0 * num_layers) + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=std) + + return init_ + + +def unscaled_init_method(sigma): + """Init method based on N(0, sigma).""" + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) + + return init_ + + +class GLMTransformer(torch.nn.Module): + """GPT-2 transformer. + + This module takes input from embedding layer and it's output can + be used directly by a logit layer. It consists of L (num-layers) + blocks of: + layer norm + self attention + residual connection + layer norm + mlp + residual connection + followed by a final layer norm. + + Arguments: + num_layers: Number of transformer layers. + hidden_size: The hidden size of the self attention. + num_attention_heads: number of attention head in the self + attention. + attention_dropout_prob: dropout probability of the attention + score in self attention. + output_dropout_prob: dropout probability for the outputs + after self attention and final output. + checkpoint_activations: if True, checkpoint activations. + checkpoint_num_layers: number of layers to checkpoint. This + is basically the chunk size in checkpoitning. + layernorm_epsilon: epsilon used in layernorm to avoid + division by zero. + init_method_std: standard deviation of the init method which has + the form N(0, std). + use_scaled_init_for_output_weights: If Ture use 1/sqrt(2*num_layers) + scaling for the output weights ( + output of self attention and mlp). + """ + + def __init__( + self, + num_layers, + hidden_size, + num_attention_heads, + max_sequence_length, + max_memory_length, + embedding_dropout_prob, + attention_dropout_prob, + output_dropout_prob, + checkpoint_activations, + checkpoint_num_layers=1, + layernorm_epsilon=1.0e-5, + init_method_std=0.02, + use_scaled_init_for_output_weights=True, + block_position_encoding=True, + attention_scale=1.0, + ): + super(GLMTransformer, self).__init__() + self.hidden_size = hidden_size + # Store activation checkpoiting flag. + self.checkpoint_activations = checkpoint_activations + self.checkpoint_num_layers = checkpoint_num_layers + self.max_memory_length = max_memory_length + + output_layer_init_method = None + if use_scaled_init_for_output_weights: + output_layer_init_method = scaled_init_method( + init_method_std, num_layers) + # Embeddings dropout + self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob) + self.block_position_encoding = block_position_encoding + + # Position embedding (serial). 
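+        # GLM uses 2D position ids (token position, block position): with block position encoding enabled, two embedding tables of size max_sequence_length + 1 are created; otherwise a single standard position embedding table is used.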
+ if block_position_encoding: + self.position_embeddings = torch.nn.Embedding( + max_sequence_length + 1, hidden_size) + self.block_position_embeddings = torch.nn.Embedding( + max_sequence_length + 1, hidden_size) + torch.nn.init.normal_(self.block_position_embeddings.weight, + mean=0.0, + std=init_method_std) + else: + self.position_embeddings = torch.nn.Embedding( + max_sequence_length, hidden_size) + # Initialize the position embeddings. + torch.nn.init.normal_(self.position_embeddings.weight, + mean=0.0, + std=init_method_std) + + def get_layer(): + + return GLMTransformerLayer( + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + layernorm_epsilon, + unscaled_init_method(init_method_std), + output_layer_init_method=output_layer_init_method, + relative_encoding=False, + performer=False, + attention_scale=attention_scale) + + # Transformer layers. + self.layers = torch.nn.ModuleList( + [get_layer() for _ in range(num_layers)]) + + # Final layer norm before output. + self.final_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon) + + # if deepspeed.checkpointing.is_configured(): + # global get_cuda_rng_tracker, checkpoint + # get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker + # checkpoint = deepspeed.checkpointing.checkpoint + + def forward(self, + hidden_states, + position_ids, + attention_mask, + memory_states=None, + encoder_states=None, + return_memory=False, + detach_memory=True): + batch_size, query_length = hidden_states.size()[:2] + memory_length = memory_states[0].size(1) if memory_states else 0 + key_length = query_length + memory_length + # attention mask is the beginning postion of B region, \in [0, query_len) + is_scalar = torch.numel(attention_mask) == 1 + is_sep = is_scalar or torch.numel(attention_mask) == batch_size + + if is_sep: + sep = attention_mask.item() if is_scalar else attention_mask + + # conventional transformer + def build_mask_matrix(seq_length, sep, memory_length=0): + m = hidden_states.new_ones((1, seq_length, seq_length)) + m = torch.tril(m) + if is_scalar: + m[0, :, :sep] = 1 + else: + m = m.expand(batch_size, -1, -1) + ids = torch.arange(seq_length, + device=sep.device, + dtype=sep.dtype).view(1, -1) + mask = ids < sep.view(-1, 1) + m = m.masked_fill(mask.unsqueeze(1).expand_as(m), 1) + if memory_length > 0: + m = m.expand(batch_size, -1, -1) + m = torch.cat((hidden_states.new_ones( + (batch_size, seq_length, memory_length)), m), + dim=2) + m = m.unsqueeze(1) + return m + + attention_mask = build_mask_matrix(query_length, + sep, + memory_length=memory_length) + else: + attention_mask = attention_mask[:, :, :, + -query_length - memory_length:] + + if self.block_position_encoding: + position_ids, block_position_ids = position_ids[:, + 0], position_ids[:, + 1] + position_embeddings = self.position_embeddings(position_ids) + hidden_states = hidden_states + position_embeddings + if self.block_position_encoding: + block_position_embeddings = self.block_position_embeddings( + block_position_ids) + hidden_states = hidden_states + block_position_embeddings + hidden_states = self.embedding_dropout(hidden_states) + + def check_detach(_hidden_states): + if detach_memory: + return _hidden_states.detach() + return _hidden_states + + if self.max_memory_length > 0 or return_memory: + mem_layers = [check_detach(hidden_states)] + else: + mem_layers = [] + + def custom(start, end): + + def custom_forward(*inputs): + layers_ = self.layers[start:end] + x_, inputs = inputs[0], inputs[1:] + + inputs, mems_ = 
inputs[:1], inputs[1:] + for i, layer in enumerate(layers_): + mem_i_ = mems_[i] if mems_ else None + x_ = layer(x_, *inputs, mem=mem_i_) + if self.max_memory_length > 0 or return_memory: + mem_layers.append(check_detach(x_)) + return x_ + + return custom_forward + + if self.checkpoint_activations: + l = 0 + num_layers = len(self.layers) + chunk_length = self.checkpoint_num_layers + while l < num_layers: + args = [hidden_states, attention_mask] + if memory_states: + args += memory_states[l:l + chunk_length] + hidden_states = checkpoint(custom(l, l + chunk_length), *args) + l += chunk_length + else: + for i, layer in enumerate(self.layers): + args = [hidden_states, attention_mask] + mem_i = memory_states[i] if memory_states else None + hidden_states = layer(*args, mem=mem_i) + if self.max_memory_length > 0 or return_memory: + mem_layers.append(check_detach(hidden_states)) + + # Final layer norm. + output = self.final_layernorm(hidden_states) + if self.max_memory_length > 0 or return_memory: + mem_layers = self.update_mems(mem_layers, + memory_states, + return_memory=return_memory) + + return (output, mem_layers) + + def update_mems(self, hiddens, mems, return_memory=False): + memory_length = mems[0].size(1) if mems else 0 + query_length = hiddens[0].size(1) + new_memory_length = memory_length + query_length + if not return_memory: + new_memory_length = min(self.max_memory_length, new_memory_length) + new_mems = [] + # with torch.no_grad(): + for i in range(len(hiddens)): + if new_memory_length <= query_length: + new_mems.append(hiddens[i][:, -new_memory_length:]) + else: + new_mems.append( + torch.cat((mems[i][:, -new_memory_length + query_length:], + hiddens[i]), + dim=1)) + return new_mems + + +if __name__ == "__main__": + + batch_size = 2 + seq_len = 512 + hidden_size = 1024 + hidden_states = torch.rand([batch_size, seq_len, hidden_size], + dtype=torch.float32).to("cuda") + position_ids = torch.ones([batch_size, 2, seq_len], + dtype=torch.int64).to('cuda') + attention_mask = torch.tensor([5, 10]).to('cuda') + + model = GLMTransformer(num_layers=24, + hidden_size=1024, + num_attention_heads=16, + max_sequence_length=512, + max_memory_length=0, + embedding_dropout_prob=0.1, + attention_dropout_prob=0.1, + output_dropout_prob=0.1, + checkpoint_activations=True, + checkpoint_num_layers=1, + layernorm_epsilon=1.0e-5, + init_method_std=0.02, + use_scaled_init_for_output_weights=True, + block_position_encoding=True, + attention_scale=1.0).to('cuda') + + outputs = model(hidden_states, position_ids, attention_mask) + print(outputs[0].shape) + print(outputs[1]) diff --git a/training/metax/glm-pytorch/extern/layers/transformer_block.py b/training/metax/glm-pytorch/extern/layers/transformer_block.py new file mode 100644 index 000000000..c3d33466b --- /dev/null +++ b/training/metax/glm-pytorch/extern/layers/transformer_block.py @@ -0,0 +1,125 @@ +import torch + +from model.layers.attention import SelfAttention +from .layernorm import LayerNorm +from model.layers.mlp import GLMMLP + + +class GLMTransformerLayer(torch.nn.Module): + """A single layer transformer for GPT2. + + We use the following notation: + h: hidden size + n: number of attention heads + b: batch size + s: sequence length + Transformore layer takes input with size [b, s, h] and returns an + output of the same size. + + Arguments: + hidden_size: The hidden size of the self attention. + num_attention_heads: number of attention head in the self + attention. 
+ attention_dropout_prob: dropout probability of the attention + score in self attention. + output_dropout_prob: dropout probability for the outputs + after self attention and final output. + layernorm_epsilon: epsilon used in layernorm to avoid + division by zero. + init_method: initialization method used for the weights. Note + that all biases are initialized to zero and + layernorm weight are initialized to one. + output_layer_init_method: output layers (attention output and + mlp output) initialization. If None, + use `init_method`. + """ + + def __init__(self, + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + layernorm_epsilon, + init_method, + output_layer_init_method=None, + relative_encoding=False, + performer=False, + attention_scale=1.0): + super(GLMTransformerLayer, self).__init__() + # Set output layer initialization if not provided. + if output_layer_init_method is None: + output_layer_init_method = init_method + + # Layernorm on the input data. + self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon) + + # Self attention. + self.attention = SelfAttention( + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + init_method, + output_layer_init_method=output_layer_init_method, + relative_encoding=relative_encoding, + performer=performer, + attention_scale=attention_scale) + + # Layernorm on the input data. + self.post_attention_layernorm = LayerNorm(hidden_size, + eps=layernorm_epsilon) + + # MLP + self.mlp = GLMMLP(hidden_size, + output_dropout_prob, + init_method, + output_layer_init_method=output_layer_init_method) + + def forward(self, + hidden_states, + ltor_mask, + position_embeddings=None, + r_w_bias=None, + r_r_bias=None, + mem=None): + # hidden_states: [b, s, h] + # ltor_mask: [1, 1, s, s] + + # Layer norm at the begining of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + mem = self.input_layernorm(mem) if mem is not None else None + # Self attention. + attention_output = self.attention(layernorm_output, ltor_mask, + position_embeddings, r_w_bias, + r_r_bias, mem) + # Residual connection. + layernorm_input = hidden_states + attention_output + # Layer norm post the self attention. + layernorm_output = self.post_attention_layernorm(layernorm_input) + # MLP. + mlp_output = self.mlp(layernorm_output) + # Second residual connection. 
+ output = layernorm_input + mlp_output + + return output + + +if __name__ == "__main__": + batch_size = 8 + seq_len = 512 + hidden_size = 1024 + num_attention_heads = 16 + attention_dropout_prob = 0.1 + output_dropout_prob = 0.1 + layernorm_epsilon = 1e-10 + init_method = torch.nn.init.xavier_normal_ + test_transformer = GLMTransformerLayer(hidden_size, num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + layernorm_epsilon, init_method) + + hidden_states = torch.rand([batch_size, seq_len, hidden_size]) + ltor_mask = torch.ones([1, 1, seq_len, seq_len]) + + outputs = test_transformer(hidden_states, ltor_mask) + print(outputs.shape) diff --git a/training/metax/glm-pytorch/extern/trainer_adapter.py b/training/metax/glm-pytorch/extern/trainer_adapter.py new file mode 100644 index 000000000..d4c3837a5 --- /dev/null +++ b/training/metax/glm-pytorch/extern/trainer_adapter.py @@ -0,0 +1,80 @@ +import torch +import config + +from torch import nn + +from .converter import convert_model as _convert_model +from driver.dist_pytorch import main_proc_print +from typing import Tuple +from model.models.modeling import FP16_Module +from driver.dist_pytorch import PyTorchDistributedDataParallel as TorchDDP + +from optimizers.loss_scaler import DynamicLossScaler + +clip_grad_norm = torch.nn.utils.clip_grad_norm_ + + +def convert_model(model: torch.nn.Module) -> torch.nn.Module: + return _convert_model(model, config) + + +def model_to_fp16(model): + # To prevent OOM for model sizes that cannot fit in GPU memory in full precision + if config.fp16: + main_proc_print(" > use fp16...") + model.half() + + # GPU allocation. + model.cuda(torch.cuda.current_device()) + + # Fp16 conversion. + if config.fp16: + model = FP16_Module(model) + return model + + +def model_to_ddp(model: nn.Module) -> nn.Module: + i = torch.cuda.current_device() + if torch.distributed.is_available() and torch.distributed.is_initialized(): + model = TorchDDP(model, device_ids=[i], output_device=i) + return model + + +def backward(step, lm_loss, reduced_loss, optimizer, lr_scheduler, model): + args = config + + if not DynamicLossScaler._has_inf_or_nan(reduced_loss): + backward_step(optimizer, model, lm_loss, args) + if step % args.gradient_accumulation_steps == 0: + optimizer.step() + if not (args.fp16 and optimizer.overflow): + lr_scheduler.step() + optimizer.zero_grad() + + else: + main_proc_print("Found NaN loss, skip backward") + return reduced_loss + + +def backward_step(optimizer, model, lm_loss, args): + """Backward step.""" + + # Total loss. + loss = lm_loss + + if args.fp16: + optimizer.backward(loss, update_master_grads=False) + else: + loss.backward() + + if args.fp16: + optimizer.update_master_grads() + + # Clipping gradients helps prevent the exploding gradient. 
+ if args.clip_grad > 0: + if not args.fp16: + clip_grad_norm(model.parameters(), args.clip_grad) + else: + optimizer.clip_master_grads(args.clip_grad) + + return lm_loss diff --git a/training/metax/mask_rcnn-pytorch/README.md b/training/metax/mask_rcnn-pytorch/README.md new file mode 100644 index 000000000..3247c5e42 --- /dev/null +++ b/training/metax/mask_rcnn-pytorch/README.md @@ -0,0 +1,50 @@ +### 模型backbone权重下载 +[模型backbone权重下载](../../benchmarks/mask_rcnn) + +### 测试数据集下载 + +[测试数据集下载](https://cocodataset.org/) + +### 沐曦集成电路 C500 GPU配置与运行信息参考 +#### 环境配置 +- ##### 硬件环境 + - 机器、加速卡型号: 曦云®C500 64G + - 多机网络类型、带宽: InfiniBand,2x200 Gb/s + +- ##### 软件环境 + - OS版本:Ubuntu 20.04.6 + - OS kernel版本: 5.4.0-26-generic + - 加速卡驱动版本:2.2.0 + - Docker 版本:24.0.7 + - 训练框架版本:pytorch-2.0.0+mc2.19.0.6-cp38-cp38-linux_x86_64.whl + - 依赖软件版本:无 + + + + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| -------------- | ----------------------- | ------------------------------------------- | +| 任务类别 | 图像目标检测 | | +| 模型 | fasterRCNN | | +| 数据集 | coco2017 | | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | MXC500 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | +| 总吞吐量 | p_whole,见“性能指标” | 实际训练图片数除以总时间(performance_whole) | +| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | +| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1) | +| 训练结果 | map,见“性能指标” | 单位为平均目标检测正确率 | +| 额外修改项 | 无 | | + + +* 性能指标 + +| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | mAP | mem | +| ------------------- | --------- | -------------- | -------- | ------- | ------- | ------ | ----- | --------- | +| MXC500 单机8卡(1x8) | fp32 | bs=8,lr=0.0001 | | | | |0.382 && 0.343| 37.1/64.0 | +| MXC500 单机单卡(1x1)| fp32 | / | | | | | | 36.2/64.0 | +| MXC500 两机16卡(2x8) | fp32 | / | | | | | | 37.1/64.0 | diff --git a/training/metax/mask_rcnn-pytorch/config/config_C500x1x1.py b/training/metax/mask_rcnn-pytorch/config/config_C500x1x1.py new file mode 100644 index 000000000..b14441400 --- /dev/null +++ b/training/metax/mask_rcnn-pytorch/config/config_C500x1x1.py @@ -0,0 +1,5 @@ +vendor: str = "metax" +train_batch_size = 16 +eval_batch_size = 16 +lr = 0.16 +max_epoch: int = 1 \ No newline at end of file diff --git a/training/metax/mask_rcnn-pytorch/config/config_C500x1x8.py b/training/metax/mask_rcnn-pytorch/config/config_C500x1x8.py new file mode 100644 index 000000000..c11690f00 --- /dev/null +++ b/training/metax/mask_rcnn-pytorch/config/config_C500x1x8.py @@ -0,0 +1,4 @@ +vendor: str = "metax" +train_batch_size = 16 +eval_batch_size = 16 +lr = 0.16 \ No newline at end of file diff --git a/training/metax/mask_rcnn-pytorch/config/config_C500x2x8.py b/training/metax/mask_rcnn-pytorch/config/config_C500x2x8.py new file mode 100644 index 000000000..e81bc64bb --- /dev/null +++ b/training/metax/mask_rcnn-pytorch/config/config_C500x2x8.py @@ -0,0 +1,5 @@ +vendor: str = "metax" +train_batch_size = 16 +eval_batch_size = 16 +lr = 0.016 +max_epoch: int = 4 \ No newline at end of file diff --git a/training/metax/mask_rcnn-pytorch/config/environment_variables.sh b/training/metax/mask_rcnn-pytorch/config/environment_variables.sh new file mode 100644 index 000000000..a7f429ac2 --- /dev/null +++ b/training/metax/mask_rcnn-pytorch/config/environment_variables.sh @@ -0,0 +1,5 @@ +# ================================================= +# Export variables +# ================================================= + +export METAX_USE_TF32=1 diff --git 
a/training/metax/mask_rcnn-pytorch/config/requirements.txt b/training/metax/mask_rcnn-pytorch/config/requirements.txt new file mode 100644 index 000000000..846b45e40 --- /dev/null +++ b/training/metax/mask_rcnn-pytorch/config/requirements.txt @@ -0,0 +1,4 @@ +pycocotools +numpy +tqdm +schedule \ No newline at end of file diff --git a/training/metax/mask_rcnn-pytorch/extern/.gitkeep b/training/metax/mask_rcnn-pytorch/extern/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index 8b12afec9..27d0d79b0 100644 --- a/training/run_benchmarks/config/test_conf.py +++ b/training/run_benchmarks/config/test_conf.py @@ -134,7 +134,8 @@ # "transformer:pytorch:R300:1:8:1": "/raid/dataset/transformer/wmt14_en_de_joined_dict", # "bigtransfer:pytorch:R300:1:8:1": "/raid/dataset/ImageNet_1k_2012/", # "efficientnet:pytorch:R300:1:8:1": "/raid/dataset/ImageNet_1k_2012/", - + # "llama2_70B:megatron:R300:10:8:1": "/raid/dataset/llama2_70B_pretrain", + # iluvatar cases # "bigtransfer:pytorch:BI-V100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", # "vit:pytorch:BI-V100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", @@ -154,6 +155,10 @@ # "resnet50:pytorch_2.0:C500:1:8:1": "/raid/dataset/ImageNet_1k_2012/", # "swin_transformer:pytorch_2.0:C500:1:8:1": "/raid/dataset/ImageNet_1k_2012/", # "WaveGlow:pytorch_2.0:C500:1:8:1": "/raid/dataset/LJSpeech/", + # "bert_hf:pytorch_2.0:C500:1:8:1": "/raid/dataset/bert_hf_train", + # "glm:pytorch_2.0:C500:1:8:1": "/raid/home_datasets_ckpt/glm/train/", # "mobilenetv2:pytorch_2.0:C500:1:8:1": "/raid/dataset/ImageNet_1k_2012/", + # "mask_rcnn:pytorch_2.0:C500:1:8:1": "/raid/dataset/coco2017/", + # "detr:pytorch_2.0:C500:1:8:1": "/raid/dataset/coco2017/", }
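As a quick cross-check of the batch-size bookkeeping described in the kunlunxin llama2_70B README above, here is a minimal sketch using the values from config_R300x10x8.py and the `llama2_70B:megatron:R300:10:8:1` case key (variable names are illustrative and not part of the patch):

```python
# Values taken from config_R300x10x8.py and the 10-machine x 8-card case key.
micro_batch_size = 1              # batchsize
gradient_accumulate_steps = 44    # accumulate_steps
tensor_parallel = 8
pipeline_parallel = 10
world_size = 10 * 8               # 10 machines x 8 R300 cards

# data_parallel_size = world_size / TPsize / PPsize, per the README
data_parallel = world_size // (tensor_parallel * pipeline_parallel)                # -> 1
global_batch_size = micro_batch_size * gradient_accumulate_steps * data_parallel   # -> 44
print(data_parallel, global_batch_size)
```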