From 71813f73701e89135b845b2d6f1e18377fa8c232 Mon Sep 17 00:00:00 2001 From: FaJingyi Date: Tue, 5 Mar 2024 11:15:43 +0800 Subject: [PATCH 1/4] [metax] swintransformer-inference pr (#473) * add metax swin-transformer * mod readme * mod readme * mod swin * Update README.md * Update config_common.py * Update requirements.txt * fix torch_six in swin_transformer * Update utils.py * add metax swintrans-infer --------- Co-authored-by: jingyifa --- inference/benchmarks/swinTransformer/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/inference/benchmarks/swinTransformer/README.md b/inference/benchmarks/swinTransformer/README.md index 14304fed9..ba517c573 100644 --- a/inference/benchmarks/swinTransformer/README.md +++ b/inference/benchmarks/swinTransformer/README.md @@ -84,4 +84,5 @@ find ./val -name "*JPEG" | wc -l | ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ----------- | | tensorrt | fp16 | 512 |1011.7 | 1347.5 | 1511.3 | 1231.7 | 1359.1 | 6.8% | 81.7/83.2 | 19.9/40.0 | | tensorrt | fp32 | 256 | 856.9 | 761.5 | 794.3 | 789.2 | 826.4 | 8.2% | 83.2/83.2 | 20.0/40.0 | -| kunlunxin_xtcl| W32A16 | 256 | 543.745 | / | / | / | / | / | 0.832 | / | +| kunlunxin_xtcl| W32A16 | 256 | / | / | / | / | / | / | 0.832 | / | +| metax-nocompiler| fp16 | 512 | / | / | / | / | / | 6.5% | 0.832 |10.6/64.0 | From 61cd337663c1578455f2b72b63ce37640dfa45e7 Mon Sep 17 00:00:00 2001 From: Rayyyyy <109121546+RRRRRayyyyy@users.noreply.github.com> Date: Tue, 5 Mar 2024 16:54:39 +0800 Subject: [PATCH 2/4] [DCU]Add glm case of dcu in FlagPerf. (#472) * Add glm case of dcu in Flagperf. * update 1*1 log * Update README infos in glm_pytorch of DCU. --------- Co-authored-by: shh2000 <13820618441@163.com> --- training/benchmarks/driver/helper.py | 6 + training/dcu/README.md | 66 ++++ training/dcu/dcu_monitor.py | 289 +++++++++++++++++ .../dcu/docker_image/pytorch_1.13/Dockerfile | 5 + .../pytorch_1.13/pytorch1.13_install.sh | 10 + training/dcu/glm-pytorch/README.md | 48 +++ .../dcu/glm-pytorch/config/config_K100x1x1.py | 19 ++ .../dcu/glm-pytorch/config/config_K100x1x8.py | 18 ++ .../config/environment_variables.sh | 6 + .../dcu/glm-pytorch/config/requirements.txt | 3 + training/dcu/glm-pytorch/extern/converter.py | 21 ++ .../dcu/glm-pytorch/extern/layers/__init__.py | 1 + .../glm-pytorch/extern/layers/layernorm.py | 1 + .../glm-pytorch/extern/layers/transformer.py | 298 ++++++++++++++++++ .../extern/layers/transformer_block.py | 125 ++++++++ .../dcu/glm-pytorch/extern/trainer_adapter.py | 80 +++++ training/run_benchmarks/config/test_conf.py | 13 +- 17 files changed, 1006 insertions(+), 3 deletions(-) create mode 100644 training/dcu/README.md create mode 100644 training/dcu/dcu_monitor.py create mode 100644 training/dcu/docker_image/pytorch_1.13/Dockerfile create mode 100644 training/dcu/docker_image/pytorch_1.13/pytorch1.13_install.sh create mode 100644 training/dcu/glm-pytorch/README.md create mode 100644 training/dcu/glm-pytorch/config/config_K100x1x1.py create mode 100644 training/dcu/glm-pytorch/config/config_K100x1x8.py create mode 100644 training/dcu/glm-pytorch/config/environment_variables.sh create mode 100644 training/dcu/glm-pytorch/config/requirements.txt create mode 100644 training/dcu/glm-pytorch/extern/converter.py create mode 100644 training/dcu/glm-pytorch/extern/layers/__init__.py create mode 100644 training/dcu/glm-pytorch/extern/layers/layernorm.py create mode 100644 
training/dcu/glm-pytorch/extern/layers/transformer.py
 create mode 100644 training/dcu/glm-pytorch/extern/layers/transformer_block.py
 create mode 100644 training/dcu/glm-pytorch/extern/trainer_adapter.py

diff --git a/training/benchmarks/driver/helper.py b/training/benchmarks/driver/helper.py
index 545e83f19..338b3d8ac 100644
--- a/training/benchmarks/driver/helper.py
+++ b/training/benchmarks/driver/helper.py
@@ -88,6 +88,12 @@ def set_seed(self, seed: int, vendor: str = None):
             torch.backends.cudnn.benchmark = getattr(config, "cudnn_benchmark")
             torch.backends.cudnn.deterministic = getattr(
                 config, "cudnn_deterministic")
+        elif lower_vendor == "dcu":
+            import torch
+            torch.manual_seed(seed)
+            torch.cuda.manual_seed(seed)
+            torch.cuda.manual_seed_all(seed)
+            torch.backends.cudnn.benchmark = True
         else:
             # TODO: extend here to set the seed for other vendors
             pass

diff --git a/training/dcu/README.md b/training/dcu/README.md
new file mode 100644
index 000000000..93478c9a7
--- /dev/null
+++ b/training/dcu/README.md
@@ -0,0 +1,66 @@
+# Vendor Information
+
+The Hygon DCU product line is built on a GPGPU architecture and is compatible with the common "CUDA-like" environment as well as mainstream international commercial computing and AI software. Its rich hardware/software ecosystem makes it broadly applicable to big-data processing, artificial intelligence, commercial computing, and related fields.
+
+Thanks to this "CUDA-like" compatibility and ecosystem, Hygon DCUs reach the performance level of comparable high-end international products in typical application scenarios.
+
+Hygon DCUs mainly target compute-intensive workloads such as big-data processing and commercial computing, together with AI and AI-adjacent acceleration.
+
+# FlagPerf Adaptation and Verification Environment
+## Reference environment configuration
+ - Hardware
+   - Machine model: K100 standard server
+   - Accelerator model: K100 64G
+ - Software
+   - OS version: CentOS 7.6
+   - OS kernel version: 4.18.0-348.el8.0.2.x86_64
+   - Docker version: 24.0.7
+
+## Container image information
+- Image build information
+  - Dockerfile path: training/dcu/docker_image/<framework>/Dockerfile
+  - Post-build software install script: training/dcu/docker_image/<framework>/<framework>_install.sh
+
+- Core software
+
+  - AI framework & version
+    - torch: 1.13.1
+
+  - Other software versions
+    - dtk: 23.10.1
+
+
+## Accelerator monitoring
+- Accelerator usage sampling command
+
+  Line 79 of dcu_monitor.py must be changed to the path that is actually sourced:
+
+  ```
+  source /path/of/dtk/env.sh
+  rocm-smi
+  ```
+
+- Sample monitor output:
+
+  ```
+  ============================ System Management Interface =============================
+  ======================================================================================
+  DCU     Temp     AvgPwr     Perf     PwrCap     VRAM%      DCU%      Mode
+  0       53.0C    96.0W      auto     300.0W     0%         0%        Normal
+  1       53.0C    96.0W      auto     300.0W     0%         0%        Normal
+  2       54.0C    95.0W      auto     300.0W     0%         0%        Normal
+  3       55.0C    96.0W      auto     300.0W     0%         0%        Normal
+  4       54.0C    97.0W      auto     300.0W     0%         0%        Normal
+  5       54.0C    95.0W      auto     300.0W     0%         0%        Normal
+  6       55.0C    93.0W      auto     300.0W     0%         0%        Normal
+  7       54.0C    96.0W      auto     300.0W     0%         0%        Normal
+  ======================================================================================
+  =================================== End of SMI Log ===================================
+  ```
+
+- Collected metrics
+
+|Metric| Log file |
+|---|---|
+|VRAM(%) | dcu_monitor.log |
+|DCU(%) | dcu_monitor.log |
\ No newline at end of file
diff --git a/training/dcu/dcu_monitor.py b/training/dcu/dcu_monitor.py
new file mode 100644
index 000000000..4572a082f
--- /dev/null
+++ b/training/dcu/dcu_monitor.py
@@ -0,0 +1,289 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+'''
+Usage: python3 dcu_monitor.py -o operation -l [log_path]
+            -o, --operation start|stop|restart|status
+            -l, --log       log path, ./logs/ by default
+'''
+
+import os
+import sys
+import time
+import signal
+import atexit
+import argparse
+import datetime
+from multiprocessing import Process
+import subprocess
+import schedule
+
+
+class Daemon:
+    '''
+    Daemon subprocess class.
+    usage: subclass this Daemon and override the run() method.
+    dcu_monitor.pid: kept in /tmp/, removed automatically on unexpected exit.
+    verbose: debug mode, disabled by default.
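+    rate: sampling interval in seconds, 5 by default.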
+    '''
+
+    def __init__(self,
+                 pid_file,
+                 log_file,
+                 err_file,
+                 dcu_log,
+                 log_path,
+                 rate=5,
+                 stdin=os.devnull,
+                 stdout=os.devnull,
+                 stderr=os.devnull,
+                 home_dir='.',
+                 umask=0o22,
+                 verbose=0):
+        self.stdin = stdin
+        self.stdout = stdout
+        self.stderr = stderr
+        self.home_dir = home_dir
+        self.verbose = verbose
+        self.pidfile = pid_file
+        self.logfile = log_file
+        self.errfile = err_file
+        self.dcufile = dcu_log
+        self.logpath = log_path
+        self.rate = rate
+        self.umask = umask
+        self.verbose = verbose
+        self.daemon_alive = True
+
+    def get_pid(self):
+        try:
+            with open(self.pidfile, 'r') as pf:
+                pid = int(pf.read().strip())
+        except IOError:
+            pid = None
+        except SystemExit:
+            pid = None
+        return pid
+
+    def del_pid(self):
+        if os.path.exists(self.pidfile):
+            os.remove(self.pidfile)
+
+    def run(self):
+        '''
+        NOTE: override this method in a subclass
+        '''
+
+        def dcu_mon(file):
+            TIMESTAMP = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
+            # change this to the local DTK install path
+            cmd = "source /path/of/dtk/env.sh; rocm-smi"
+            process = subprocess.Popen(cmd,
+                                       shell=True,
+                                       stdout=subprocess.PIPE,
+                                       stderr=subprocess.STDOUT,
+                                       encoding='utf-8')
+            try:
+                out = process.communicate(timeout=10)
+            except subprocess.TimeoutExpired:
+                process.kill()
+                out = process.communicate()
+
+            # record "error" when rocm-smi fails instead of discarding the status
+            status = "error" if process.returncode != 0 else out[0]
+            result = TIMESTAMP + "\n" + status + "\n"
+            with open(file, 'a') as f:
+                f.write(result)
+
+        def timer_dcu_mon():
+            dcu_process = Process(target=dcu_mon, args=(self.dcufile, ))
+            dcu_process.start()
+
+        schedule.every(self.rate).seconds.do(timer_dcu_mon)
+        while True:
+            schedule.run_pending()
+            time.sleep(5)
+
+    def daemonize(self):
+        if self.verbose >= 1:
+            print('daemon process starting ...')
+        try:
+            pid = os.fork()
+            if pid > 0:
+                sys.exit(0)
+        except OSError as e:
+            sys.stderr.write('fork #1 failed: %d (%s)\n' %
+                             (e.errno, e.strerror))
+            sys.exit(1)
+        os.chdir(self.home_dir)
+        os.setsid()
+        os.umask(self.umask)
+        try:
+            pid = os.fork()
+            if pid > 0:
+                sys.exit(0)
+        except OSError as e:
+            sys.stderr.write('fork #2 failed: %d (%s)\n' %
+                             (e.errno, e.strerror))
+            sys.exit(1)
+        sys.stdout.flush()
+        sys.stderr.flush()
+        si = open(self.stdin, 'r')
+        so = open(self.stdout, 'a+')
+        if self.stderr:
+            se = open(self.stderr, 'a+')
+        else:
+            se = so
+        os.dup2(si.fileno(), sys.stdin.fileno())
+        os.dup2(so.fileno(), sys.stdout.fileno())
+        os.dup2(se.fileno(), sys.stderr.fileno())
+        atexit.register(self.del_pid)
+        pid = str(os.getpid())
+        with open(self.pidfile, 'w+') as f:
+            f.write('%s\n' % pid)
+
+    def start(self):
+        if not os.path.exists(self.logpath):
+            os.makedirs(self.logpath)
+        elif os.path.exists(self.dcufile):
+            os.remove(self.dcufile)
+        if self.verbose >= 1:
+            print('ready to start ......')
+        # check the pid file to see whether the daemon is already running
+        pid = self.get_pid()
+        if pid:
+            msg = 'pid file %s already exists, is it already running?\n'
+            sys.stderr.write(msg % self.pidfile)
+            sys.exit(1)
+        # start the daemon
+        self.daemonize()
+        self.run()
+
+    def stop(self):
+        if self.verbose >= 1:
+            print('stopping ...')
+        pid = self.get_pid()
+        if not pid:
+            msg = 'pid file [%s] does not exist. Not running?\n' % self.pidfile
+            sys.stderr.write(msg)
+            if os.path.exists(self.pidfile):
+                os.remove(self.pidfile)
+            return
+        # try to kill the daemon process
+        try:
+            i = 0
+            while 1:
+                os.kill(pid, signal.SIGTERM)
+                time.sleep(1)
+                i = i + 1
+                if i % 10 == 0:
+                    os.kill(pid, signal.SIGHUP)
+        except OSError as err:
+            err = str(err)
+            if err.find('No such process') > 0:
+                if os.path.exists(self.pidfile):
+                    os.remove(self.pidfile)
+            else:
+                print(str(err))
+                sys.exit(1)
+        if self.verbose >= 1:
+            print('Stopped!')
+
+    def restart(self):
+        self.stop()
+        self.start()
+
+    def status(self):
+        pid = self.get_pid()
+        if pid:
+            if os.path.exists('/proc/%d' % pid):
+                return pid
+        return False
+
+
+def parse_args():
+    ''' Parse the script's input parameters. '''
+    parse = argparse.ArgumentParser(description='Sys monitor script')
+    parse.add_argument('-o',
+                       type=str,
+                       metavar='[operation]',
+                       required=True,
+                       help='start|stop|restart|status')
+    parse.add_argument('-l',
+                       type=str,
+                       metavar='[log_path]',
+                       required=False,
+                       default='./logs/',
+                       help='log path')
+    args = parse.parse_args()
+    return args
+
+
+def get_system_info():
+    cmd = r"echo OS version:;"
+    cmd = cmd + r"cat /etc/issue | head -n1 | awk '{print $1, $2, $3}';"
+    cmd = cmd + r"echo ;"
+
+    cmd = cmd + r"echo OS Kernel version:;"
+    cmd = cmd + r"uname -r;"
+    cmd = cmd + r"echo ;"
+
+    cmd = cmd + r"echo Hardware Model:;"
+    cmd = cmd + r"sudo dmidecode | grep -A9 'System Information' | tail -n +2 | sed 's/^[ \t]*//';"
+    cmd = cmd + r"echo ;"
+
+    cmd = cmd + r"echo Accelerator Model:;"
+    cmd = cmd + r"rocm-smi -L;"
+    cmd = cmd + r"echo ;"
+
+    cmd = cmd + r"echo Accelerator Driver version:;"
+    cmd = cmd + r"rocm-smi | grep 'Driver Version' | awk '{print $3}';"
+    cmd = cmd + r"echo ;"
+
+    cmd = cmd + r"echo Docker version:;"
+    cmd = cmd + r"docker -v"
+
+    return cmd
+
+
+def main():
+    sample_rate1 = 5
+    args = parse_args()
+    operation = args.o
+    log_path = args.l
+    pid_fn = str('/tmp/dcu_monitor.pid')
+    log_fn = str(log_path + '/dcu_monitor.log')
+    err_fn = str(log_path + '/dcu_monitor.err')
+    # dcu sampling results are appended to the same dcu_monitor.log
+    dcu_fn = str(log_path + '/dcu_monitor.log')
+    sys_fn = str(log_path + '/sys_info.log')
+    cmd = get_system_info()
+    with open(sys_fn, "w") as f:
+        p = subprocess.Popen(cmd, shell=True, stdout=f, stderr=subprocess.STDOUT)
+        p.wait()
+
+    subdaemon = Daemon(pid_fn,
+                       log_fn,
+                       err_fn,
+                       dcu_fn,
+                       log_path,
+                       verbose=1,
+                       rate=sample_rate1)
+    if operation == 'start':
+        subdaemon.start()
+    elif operation == 'stop':
+        subdaemon.stop()
+    elif operation == 'restart':
+        subdaemon.restart()
+    elif operation == 'status':
+        pid = subdaemon.status()
+        if pid:
+            print('process [%s] is running ......' % pid)
+        else:
+            print('daemon process [%s] stopped' % pid)
+    else:
+        print("invalid argument!")
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()

diff --git a/training/dcu/docker_image/pytorch_1.13/Dockerfile b/training/dcu/docker_image/pytorch_1.13/Dockerfile
new file mode 100644
index 000000000..47e386e7a
--- /dev/null
+++ b/training/dcu/docker_image/pytorch_1.13/Dockerfile
@@ -0,0 +1,5 @@
+FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk23.10-py38
+RUN source /opt/dtk/env.sh
+RUN /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple"
+RUN /bin/bash -c "uname -a"
+RUN /bin/bash -c alias python3=python

diff --git a/training/dcu/docker_image/pytorch_1.13/pytorch1.13_install.sh b/training/dcu/docker_image/pytorch_1.13/pytorch1.13_install.sh
new file mode 100644
index 000000000..237096b5c
--- /dev/null
+++ b/training/dcu/docker_image/pytorch_1.13/pytorch1.13_install.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+proxy_server_ip=10.0.35.251
+PROXY_URL="http://$proxy_server_ip:3128/"
+NO_PROXY_ADDR="127.0.0.1,localhost,.local,.cluster.local,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16"
+export http_proxy="$PROXY_URL"
+export https_proxy="$PROXY_URL"
+export no_proxy="$NO_PROXY_ADDR"
+export HTTP_PROXY="$PROXY_URL"
+export HTTPS_PROXY="$PROXY_URL"
+export NO_PROXY="$NO_PROXY_ADDR"
\ No newline at end of file
diff --git a/training/dcu/glm-pytorch/README.md b/training/dcu/glm-pytorch/README.md
new file mode 100644
index 000000000..70246a4b5
--- /dev/null
+++ b/training/dcu/glm-pytorch/README.md
@@ -0,0 +1,48 @@
+### Model checkpoint download
+[Model checkpoint download](../../benchmarks/glm/README.md#模型checkpoint)
+### Test dataset download
+[Test dataset download](../../benchmarks/glm/README.md#数据集)
+
+### DCU configuration and run information reference
+#### Environment configuration
+- ##### Hardware environment
+  - Machine model: K100 standard server
+  - Accelerator model: K100 64G
+  - CPU model:
+  - Multi-node network type and bandwidth:
+- ##### Software environment
+  - OS version: CentOS 7.6
+  - OS kernel version: 4.18.0-348.el8.0.2.x86_64
+  - Accelerator driver version: dtk-23.10.1
+  - Docker version: 24.0.7
+  - Training framework version: pytorch-1.13.1+git7d2dd01.abi0.dtk2310
+  - Dependency software versions: none
+
+
+### Results
+* Common metrics
+
+| Metric | Value | Notes |
+| ---------------- | -------------------------------------------- | ------------------------------------------- |
+| Task category | natural language understanding, unconditional and conditional text generation | |
+| Model | GLM | |
+| Dataset | SuperGLUE | |
+| Precision | precision, see "Performance metrics" | fp32/amp/fp16 selectable |
+| Hyperparameter changes | fix_hp, see "Performance metrics" | special hyperparameters needed to saturate the hardware during throughput evaluation |
+| Hardware short name | DCU K100 | |
+| Device memory usage | mem (actual/total), see "Performance metrics" | commonly called "VRAM", in GiB |
+| End-to-end time | e2e_time, see "Performance metrics" | total time plus Perf initialization time, etc. |
+| Overall throughput | p_whole, see "Performance metrics" | actual training samples divided by total time (performance_whole) |
+| Training throughput | p_train, see "Performance metrics" | excludes the evaluation time at the end of each epoch |
+| **Compute throughput** | **p_core, see "Performance metrics"** | excludes data-I/O time (p3>p2>p1) |
+| **Accelerator utilization** | **\*MFU** | model FLOPs utilization |
+| Training result | acc, see "Performance metrics" | accuracy |
+| Additional modifications | none | |
+
+* Performance metrics
+
+| Config | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem | MFU |
+| ------------------ | --------- | ------------- | -------- | ------- | ------- | ------ | ----- | --------- | ----- |
+| K100 single node, 1 card (1x1) | fp16 | bs=16, lr=1e-05 | / | / | / | / | / | 55.0/64.0 | / |
+| K100 single node, 8 cards (1x8) | fp16 | bs=8, lr=1e-05 | / | / | / | / | 0.804 | 29.4/64.0 | / |
+
+The 2x8 results will be added in 2024.07; the machines are currently fully booked.
\ No newline at end of file
diff --git a/training/dcu/glm-pytorch/config/config_K100x1x1.py b/training/dcu/glm-pytorch/config/config_K100x1x1.py
new file mode 100644
index 000000000..4446c704e
--- /dev/null
+++ b/training/dcu/glm-pytorch/config/config_K100x1x1.py
@@ -0,0 +1,19 @@
+train_batch_size = 16
+eval_batch_size = 16
+
+max_samples_termination = 24135
+
+dist_backend = "nccl"
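+# note: PyTorch keeps the backend name "nccl" on DCU; under the ROCm-derived DTK stack it is typically served by RCCL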
+
+lr = 1e-5
+weight_decay = 0.1
+adam_beta1 = 0.9
+adam_beta2 = 0.999
+adam_eps = 1e-08
+gradient_accumulation_steps = 1
+warmup = 0.1
+lr_decay_ratio = 0.1
+lr_decay_iters = 4338
+log_freq = 1
+
+training_event = None

diff --git a/training/dcu/glm-pytorch/config/config_K100x1x8.py b/training/dcu/glm-pytorch/config/config_K100x1x8.py
new file mode 100644
index 000000000..70311cdc8
--- /dev/null
+++ b/training/dcu/glm-pytorch/config/config_K100x1x8.py
@@ -0,0 +1,18 @@
+train_batch_size = 8
+eval_batch_size = 8
+
+dist_backend = "nccl"
+
+lr = 1e-5
+weight_decay = 0.1
+adam_beta1 = 0.9
+adam_beta2 = 0.999
+adam_eps = 1e-08
+gradient_accumulation_steps = 1
+warmup = 0.1
+lr_decay_ratio = 0.1
+lr_decay_iters = 4338
+log_freq = 1
+seed = 10483
+max_samples_termination = 5553080
+training_event = None

diff --git a/training/dcu/glm-pytorch/config/environment_variables.sh b/training/dcu/glm-pytorch/config/environment_variables.sh
new file mode 100644
index 000000000..68fad820a
--- /dev/null
+++ b/training/dcu/glm-pytorch/config/environment_variables.sh
@@ -0,0 +1,6 @@
+# =================================================
+# Export variables
+# =================================================
+
+export HSA_FORCE_FINE_GRAIN_PCIE=1
+export GPU_FLUSH_ON_EXECUTION=1

diff --git a/training/dcu/glm-pytorch/config/requirements.txt b/training/dcu/glm-pytorch/config/requirements.txt
new file mode 100644
index 000000000..8599cad5e
--- /dev/null
+++ b/training/dcu/glm-pytorch/config/requirements.txt
@@ -0,0 +1,3 @@
+h5sparse
+boto3
+h5py
\ No newline at end of file
diff --git a/training/dcu/glm-pytorch/extern/converter.py b/training/dcu/glm-pytorch/extern/converter.py
new file mode 100644
index 000000000..330bee79a
--- /dev/null
+++ b/training/dcu/glm-pytorch/extern/converter.py
@@ -0,0 +1,21 @@
+from driver import dist_pytorch
+from .layers.transformer import GLMTransformer
+
+
+def convert_model(model, config):
+    if dist_pytorch.get_rank() == 0:
+        print("use apex layer norm", flush=True)
+    state_dict = model.state_dict()
+    transformer_layer = GLMTransformer(
+        num_layers=config.num_layers,
+        hidden_size=config.hidden_size,
+        num_attention_heads=config.num_attention_heads,
+        max_sequence_length=config.max_seq_length,
+        max_memory_length=config.max_memory_length,
+        embedding_dropout_prob=config.hidden_dropout,
+        attention_dropout_prob=config.attention_dropout,
+        output_dropout_prob=config.hidden_dropout,
+        checkpoint_activations=config.checkpoint_activations)
+    model.model.transformer = transformer_layer
+    model.load_state_dict(state_dict, strict=True)
+    return model

diff --git a/training/dcu/glm-pytorch/extern/layers/__init__.py b/training/dcu/glm-pytorch/extern/layers/__init__.py
new file mode 100644
index 000000000..dab9da7e4
--- /dev/null
+++ b/training/dcu/glm-pytorch/extern/layers/__init__.py
@@ -0,0 +1 @@
+from .transformer import *

diff --git a/training/dcu/glm-pytorch/extern/layers/layernorm.py b/training/dcu/glm-pytorch/extern/layers/layernorm.py
new file mode 100644
index 000000000..96a935cc6
--- /dev/null
+++ b/training/dcu/glm-pytorch/extern/layers/layernorm.py
@@ -0,0 +1 @@
+from apex.normalization import FusedLayerNorm as LayerNorm

diff --git a/training/dcu/glm-pytorch/extern/layers/transformer.py b/training/dcu/glm-pytorch/extern/layers/transformer.py
new file mode 100644
index 000000000..82e98ed26
--- /dev/null
+++ b/training/dcu/glm-pytorch/extern/layers/transformer.py
@@ -0,0 +1,298 @@
+import torch
+import math
+
+from .transformer_block import GLMTransformerLayer
+from .layernorm import LayerNorm
+from model.models.checkpoint import checkpoint
+
+
+def scaled_init_method(sigma, num_layers):
+    """Init method based on N(0, sigma/sqrt(2*num_layers))."""
+    std = sigma / math.sqrt(2.0 * num_layers)
+
+    def init_(tensor):
+        return torch.nn.init.normal_(tensor, mean=0.0, std=std)
+
+    return init_
+
+
+def unscaled_init_method(sigma):
+    """Init method based on N(0, sigma)."""
+
+    def init_(tensor):
+        return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)
+
+    return init_
+
+
+class GLMTransformer(torch.nn.Module):
+    """GPT-2 transformer.
+
+    This module takes input from the embedding layer, and its output can
+    be used directly by a logit layer. It consists of L (num-layers)
+    blocks of:
+        layer norm
+        self attention
+        residual connection
+        layer norm
+        mlp
+        residual connection
+    followed by a final layer norm.
+
+    Arguments:
+        num_layers: Number of transformer layers.
+        hidden_size: The hidden size of the self attention.
+        num_attention_heads: number of attention heads in the self
+                             attention.
+        attention_dropout_prob: dropout probability of the attention
+                                score in self attention.
+        output_dropout_prob: dropout probability for the outputs
+                             after self attention and final output.
+        checkpoint_activations: if True, checkpoint activations.
+        checkpoint_num_layers: number of layers to checkpoint. This
+                               is basically the chunk size in checkpointing.
+        layernorm_epsilon: epsilon used in layernorm to avoid
+                           division by zero.
+        init_method_std: standard deviation of the init method which has
+                         the form N(0, std).
+        use_scaled_init_for_output_weights: If True, use 1/sqrt(2*num_layers)
+                                            scaling for the output weights
+                                            (output of self attention and mlp).
+    """
+
+    def __init__(
+        self,
+        num_layers,
+        hidden_size,
+        num_attention_heads,
+        max_sequence_length,
+        max_memory_length,
+        embedding_dropout_prob,
+        attention_dropout_prob,
+        output_dropout_prob,
+        checkpoint_activations,
+        checkpoint_num_layers=1,
+        layernorm_epsilon=1.0e-5,
+        init_method_std=0.02,
+        use_scaled_init_for_output_weights=True,
+        block_position_encoding=True,
+        attention_scale=1.0,
+    ):
+        super(GLMTransformer, self).__init__()
+        self.hidden_size = hidden_size
+        # Store the activation checkpointing flag.
+        self.checkpoint_activations = checkpoint_activations
+        self.checkpoint_num_layers = checkpoint_num_layers
+        self.max_memory_length = max_memory_length
+
+        output_layer_init_method = None
+        if use_scaled_init_for_output_weights:
+            output_layer_init_method = scaled_init_method(
+                init_method_std, num_layers)
+        # Embeddings dropout
+        self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)
+        self.block_position_encoding = block_position_encoding
+
+        # Position embedding (serial).
+        if block_position_encoding:
+            self.position_embeddings = torch.nn.Embedding(
+                max_sequence_length + 1, hidden_size)
+            self.block_position_embeddings = torch.nn.Embedding(
+                max_sequence_length + 1, hidden_size)
+            torch.nn.init.normal_(self.block_position_embeddings.weight,
+                                  mean=0.0,
+                                  std=init_method_std)
+        else:
+            self.position_embeddings = torch.nn.Embedding(
+                max_sequence_length, hidden_size)
+        # Initialize the position embeddings.
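+        # (drawn from N(0, init_method_std), the same scheme as unscaled_init_method)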
+        torch.nn.init.normal_(self.position_embeddings.weight,
+                              mean=0.0,
+                              std=init_method_std)
+
+        def get_layer():
+
+            return GLMTransformerLayer(
+                hidden_size,
+                num_attention_heads,
+                attention_dropout_prob,
+                output_dropout_prob,
+                layernorm_epsilon,
+                unscaled_init_method(init_method_std),
+                output_layer_init_method=output_layer_init_method,
+                relative_encoding=False,
+                performer=False,
+                attention_scale=attention_scale)
+
+        # Transformer layers.
+        self.layers = torch.nn.ModuleList(
+            [get_layer() for _ in range(num_layers)])
+
+        # Final layer norm before output.
+        self.final_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)
+
+        # if deepspeed.checkpointing.is_configured():
+        #     global get_cuda_rng_tracker, checkpoint
+        #     get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker
+        #     checkpoint = deepspeed.checkpointing.checkpoint
+
+    def forward(self,
+                hidden_states,
+                position_ids,
+                attention_mask,
+                memory_states=None,
+                encoder_states=None,
+                return_memory=False,
+                detach_memory=True):
+        batch_size, query_length = hidden_states.size()[:2]
+        memory_length = memory_states[0].size(1) if memory_states else 0
+        key_length = query_length + memory_length
+        # the attention mask is the beginning position of the B region, in [0, query_len)
+        is_scalar = torch.numel(attention_mask) == 1
+        is_sep = is_scalar or torch.numel(attention_mask) == batch_size
+
+        if is_sep:
+            sep = attention_mask.item() if is_scalar else attention_mask
+
+            # conventional transformer
+            def build_mask_matrix(seq_length, sep, memory_length=0):
+                m = hidden_states.new_ones((1, seq_length, seq_length))
+                m = torch.tril(m)
+                if is_scalar:
+                    m[0, :, :sep] = 1
+                else:
+                    m = m.expand(batch_size, -1, -1)
+                    ids = torch.arange(seq_length,
+                                       device=sep.device,
+                                       dtype=sep.dtype).view(1, -1)
+                    mask = ids < sep.view(-1, 1)
+                    m = m.masked_fill(mask.unsqueeze(1).expand_as(m), 1)
+                if memory_length > 0:
+                    m = m.expand(batch_size, -1, -1)
+                    m = torch.cat((hidden_states.new_ones(
+                        (batch_size, seq_length, memory_length)), m),
+                                  dim=2)
+                m = m.unsqueeze(1)
+                return m
+
+            attention_mask = build_mask_matrix(query_length,
+                                               sep,
+                                               memory_length=memory_length)
+        else:
+            attention_mask = attention_mask[:, :, :,
+                                            -query_length - memory_length:]
+
+        if self.block_position_encoding:
+            position_ids, block_position_ids = position_ids[:,
+                                                            0], position_ids[:,
+                                                                             1]
+        position_embeddings = self.position_embeddings(position_ids)
+        hidden_states = hidden_states + position_embeddings
+        if self.block_position_encoding:
+            block_position_embeddings = self.block_position_embeddings(
+                block_position_ids)
+            hidden_states = hidden_states + block_position_embeddings
+        hidden_states = self.embedding_dropout(hidden_states)
+
+        def check_detach(_hidden_states):
+            if detach_memory:
+                return _hidden_states.detach()
+            return _hidden_states
+
+        if self.max_memory_length > 0 or return_memory:
+            mem_layers = [check_detach(hidden_states)]
+        else:
+            mem_layers = []
+
+        def custom(start, end):
+
+            def custom_forward(*inputs):
+                layers_ = self.layers[start:end]
+                x_, inputs = inputs[0], inputs[1:]
+
+                inputs, mems_ = inputs[:1], inputs[1:]
+                for i, layer in enumerate(layers_):
+                    mem_i_ = mems_[i] if mems_ else None
+                    x_ = layer(x_, *inputs, mem=mem_i_)
+                    if self.max_memory_length > 0 or return_memory:
+                        mem_layers.append(check_detach(x_))
+                return x_
+
+            return custom_forward
+
+        if self.checkpoint_activations:
+            l = 0
+            num_layers = len(self.layers)
+            chunk_length = self.checkpoint_num_layers
+            while l < num_layers:
+                args = [hidden_states, attention_mask]
+                if memory_states:
+                    args += memory_states[l:l + chunk_length]
+                hidden_states = checkpoint(custom(l, l + chunk_length), *args)
+                l += chunk_length
+        else:
+            for i, layer in enumerate(self.layers):
+                args = [hidden_states, attention_mask]
+                mem_i = memory_states[i] if memory_states else None
+                hidden_states = layer(*args, mem=mem_i)
+                if self.max_memory_length > 0 or return_memory:
+                    mem_layers.append(check_detach(hidden_states))
+
+        # Final layer norm.
+        output = self.final_layernorm(hidden_states)
+        if self.max_memory_length > 0 or return_memory:
+            mem_layers = self.update_mems(mem_layers,
+                                          memory_states,
+                                          return_memory=return_memory)
+
+        return (output, mem_layers)
+
+    def update_mems(self, hiddens, mems, return_memory=False):
+        memory_length = mems[0].size(1) if mems else 0
+        query_length = hiddens[0].size(1)
+        new_memory_length = memory_length + query_length
+        if not return_memory:
+            new_memory_length = min(self.max_memory_length, new_memory_length)
+        new_mems = []
+        # with torch.no_grad():
+        for i in range(len(hiddens)):
+            if new_memory_length <= query_length:
+                new_mems.append(hiddens[i][:, -new_memory_length:])
+            else:
+                new_mems.append(
+                    torch.cat((mems[i][:, -new_memory_length + query_length:],
+                               hiddens[i]),
+                              dim=1))
+        return new_mems
+
+
+if __name__ == "__main__":
+
+    batch_size = 2
+    seq_len = 512
+    hidden_size = 1024
+    hidden_states = torch.rand([batch_size, seq_len, hidden_size],
+                               dtype=torch.float32).to("cuda")
+    position_ids = torch.ones([batch_size, 2, seq_len],
+                              dtype=torch.int64).to('cuda')
+    attention_mask = torch.tensor([5, 10]).to('cuda')
+
+    model = GLMTransformer(num_layers=24,
+                           hidden_size=1024,
+                           num_attention_heads=16,
+                           max_sequence_length=512,
+                           max_memory_length=0,
+                           embedding_dropout_prob=0.1,
+                           attention_dropout_prob=0.1,
+                           output_dropout_prob=0.1,
+                           checkpoint_activations=True,
+                           checkpoint_num_layers=1,
+                           layernorm_epsilon=1.0e-5,
+                           init_method_std=0.02,
+                           use_scaled_init_for_output_weights=True,
+                           block_position_encoding=True,
+                           attention_scale=1.0).to('cuda')
+
+    outputs = model(hidden_states, position_ids, attention_mask)
+    print(outputs[0].shape)
+    print(outputs[1])

diff --git a/training/dcu/glm-pytorch/extern/layers/transformer_block.py b/training/dcu/glm-pytorch/extern/layers/transformer_block.py
new file mode 100644
index 000000000..c3d33466b
--- /dev/null
+++ b/training/dcu/glm-pytorch/extern/layers/transformer_block.py
@@ -0,0 +1,125 @@
+import torch
+
+from model.layers.attention import SelfAttention
+from .layernorm import LayerNorm
+from model.layers.mlp import GLMMLP
+
+
+class GLMTransformerLayer(torch.nn.Module):
+    """A single transformer layer for GPT-2.
+
+    We use the following notation:
+        h: hidden size
+        n: number of attention heads
+        b: batch size
+        s: sequence length
+    The transformer layer takes input of size [b, s, h] and returns an
+    output of the same size.
+
+    Arguments:
+        hidden_size: The hidden size of the self attention.
+        num_attention_heads: number of attention heads in the self
+                             attention.
+        attention_dropout_prob: dropout probability of the attention
+                                score in self attention.
+        output_dropout_prob: dropout probability for the outputs
+                             after self attention and final output.
+        layernorm_epsilon: epsilon used in layernorm to avoid
+                           division by zero.
+        init_method: initialization method used for the weights. Note
+                     that all biases are initialized to zero and
+                     layernorm weights are initialized to one.
+        output_layer_init_method: initialization for the output layers
+                                  (attention output and mlp output). If
+                                  None, use `init_method`.
+    """
+
+    def __init__(self,
+                 hidden_size,
+                 num_attention_heads,
+                 attention_dropout_prob,
+                 output_dropout_prob,
+                 layernorm_epsilon,
+                 init_method,
+                 output_layer_init_method=None,
+                 relative_encoding=False,
+                 performer=False,
+                 attention_scale=1.0):
+        super(GLMTransformerLayer, self).__init__()
+        # Set output layer initialization if not provided.
+        if output_layer_init_method is None:
+            output_layer_init_method = init_method
+
+        # Layernorm on the input data.
+        self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)
+
+        # Self attention.
+        self.attention = SelfAttention(
+            hidden_size,
+            num_attention_heads,
+            attention_dropout_prob,
+            output_dropout_prob,
+            init_method,
+            output_layer_init_method=output_layer_init_method,
+            relative_encoding=relative_encoding,
+            performer=performer,
+            attention_scale=attention_scale)
+
+        # Layernorm after the self attention.
+        self.post_attention_layernorm = LayerNorm(hidden_size,
+                                                  eps=layernorm_epsilon)
+
+        # MLP
+        self.mlp = GLMMLP(hidden_size,
+                          output_dropout_prob,
+                          init_method,
+                          output_layer_init_method=output_layer_init_method)
+
+    def forward(self,
+                hidden_states,
+                ltor_mask,
+                position_embeddings=None,
+                r_w_bias=None,
+                r_r_bias=None,
+                mem=None):
+        # hidden_states: [b, s, h]
+        # ltor_mask: [1, 1, s, s]
+
+        # Layer norm at the beginning of the transformer layer.
+        layernorm_output = self.input_layernorm(hidden_states)
+        mem = self.input_layernorm(mem) if mem is not None else None
+        # Self attention.
+        attention_output = self.attention(layernorm_output, ltor_mask,
+                                          position_embeddings, r_w_bias,
+                                          r_r_bias, mem)
+        # Residual connection.
+        layernorm_input = hidden_states + attention_output
+        # Layer norm after the self attention.
+        layernorm_output = self.post_attention_layernorm(layernorm_input)
+        # MLP.
+        mlp_output = self.mlp(layernorm_output)
+        # Second residual connection.
+        output = layernorm_input + mlp_output
+
+        return output
+
+
+if __name__ == "__main__":
+    batch_size = 8
+    seq_len = 512
+    hidden_size = 1024
+    num_attention_heads = 16
+    attention_dropout_prob = 0.1
+    output_dropout_prob = 0.1
+    layernorm_epsilon = 1e-10
+    init_method = torch.nn.init.xavier_normal_
+    test_transformer = GLMTransformerLayer(hidden_size, num_attention_heads,
+                                           attention_dropout_prob,
+                                           output_dropout_prob,
+                                           layernorm_epsilon, init_method)
+
+    hidden_states = torch.rand([batch_size, seq_len, hidden_size])
+    ltor_mask = torch.ones([1, 1, seq_len, seq_len])
+
+    outputs = test_transformer(hidden_states, ltor_mask)
+    print(outputs.shape)

diff --git a/training/dcu/glm-pytorch/extern/trainer_adapter.py b/training/dcu/glm-pytorch/extern/trainer_adapter.py
new file mode 100644
index 000000000..d4c3837a5
--- /dev/null
+++ b/training/dcu/glm-pytorch/extern/trainer_adapter.py
@@ -0,0 +1,80 @@
+import torch
+import config
+
+from torch import nn
+
+from .converter import convert_model as _convert_model
+from driver.dist_pytorch import main_proc_print
+from typing import Tuple
+from model.models.modeling import FP16_Module
+from driver.dist_pytorch import PyTorchDistributedDataParallel as TorchDDP
+
+from optimizers.loss_scaler import DynamicLossScaler
+
+clip_grad_norm = torch.nn.utils.clip_grad_norm_
+
+
+def convert_model(model: torch.nn.Module) -> torch.nn.Module:
+    return _convert_model(model, config)
+
+
+def model_to_fp16(model):
+    # To prevent OOM for model sizes that cannot fit in GPU memory in full precision
+    if config.fp16:
+        main_proc_print(" > use fp16...")
+        model.half()
+
+    # GPU allocation.
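+    # (DTK exposes DCUs through the torch.cuda API, so .cuda() places the model on the current DCU)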
+ model.cuda(torch.cuda.current_device()) + + # Fp16 conversion. + if config.fp16: + model = FP16_Module(model) + return model + + +def model_to_ddp(model: nn.Module) -> nn.Module: + i = torch.cuda.current_device() + if torch.distributed.is_available() and torch.distributed.is_initialized(): + model = TorchDDP(model, device_ids=[i], output_device=i) + return model + + +def backward(step, lm_loss, reduced_loss, optimizer, lr_scheduler, model): + args = config + + if not DynamicLossScaler._has_inf_or_nan(reduced_loss): + backward_step(optimizer, model, lm_loss, args) + if step % args.gradient_accumulation_steps == 0: + optimizer.step() + if not (args.fp16 and optimizer.overflow): + lr_scheduler.step() + optimizer.zero_grad() + + else: + main_proc_print("Found NaN loss, skip backward") + return reduced_loss + + +def backward_step(optimizer, model, lm_loss, args): + """Backward step.""" + + # Total loss. + loss = lm_loss + + if args.fp16: + optimizer.backward(loss, update_master_grads=False) + else: + loss.backward() + + if args.fp16: + optimizer.update_master_grads() + + # Clipping gradients helps prevent the exploding gradient. + if args.clip_grad > 0: + if not args.fp16: + clip_grad_norm(model.parameters(), args.clip_grad) + else: + optimizer.clip_master_grads(args.clip_grad) + + return lm_loss diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index 41397ffbc..ae5778abc 100644 --- a/training/run_benchmarks/config/test_conf.py +++ b/training/run_benchmarks/config/test_conf.py @@ -1,7 +1,7 @@ '''Test Configs, including''' # -*-coding:utf-8 -*- -# Set accelerator's vendor name, e.g. iluvatar, cambricon, kunlunxin, ascend, mthreads and metax. +# Set accelerator's vendor name, e.g. iluvatar, cambricon, kunlunxin, ascend, mthreads, metax and dcu. 
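+# e.g., to run the DCU cases listed at the bottom of this file: VENDOR = "dcu"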
# We will run benchmarks in training/ VENDOR = "nvidia" @@ -23,6 +23,8 @@ # " --env MTHREADS_VISIBLE_DEVICES=all" # metax: # " --device=/dev/dri --device=/dev/mxcd --group-add video" +# dcu: +# "-v /opt/hyhal/:/opt/hyhal/ --device=/dev/kfd --device=/dev/dri/ --group-add video" ACCE_CONTAINER_OPT = " --gpus all" # XXX_VISIBLE_DEVICE item name in env # possible value of ACCE_VISIBLE_DEVICE_ENV_NAME are: @@ -31,6 +33,7 @@ # XPU_VISIBLE_DEVICES for kunlunxin # ASCEND_VISIBLE_DEVICES for ascend # MUSA_VISIBLE_DEVICES for mthreads +# HIP_VISIBLE_DEVICES for dcu ACCE_VISIBLE_DEVICE_ENV_NAME = "CUDA_VISIBLE_DEVICES" # Set pip source, which will be used in preparing envs in container @@ -79,14 +82,14 @@ # "wav2vec2:pytorch_1.13:A100:1:8:1": "/raid/dataset/wav2vec2_data/LibriSpeech", # "WaveGlow:pytorch_1.13:A100:1:8:1": "/raid/dataset/LJSpeech/", - "resnet50:tensorflow2:A100:1:8:1": "/raid/dataset/ImageNet2012/tf_records/", + # "resnet50:tensorflow2:A100:1:8:1": "/raid/dataset/ImageNet2012/tf_records/", # "distilbert:pytorch_1.12:A100:1:8:1": "/raid/dataset/distilbert/", # "transformer:pytorch_1.13:A100:1:8:1": "/raid/dataset/transformer/wmt14_en_de_joined_dict", # "swin_transformer:pytorch_1.8:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", # "transformer_xl:pytorch_1.8:A100:1:8:1": "/raid/dataset/transformer_xl/", - "t5_small:pytorch_1.12:A100:1:8:1": "/raid/dataset/t5_small_train", + # "t5_small:pytorch_1.12:A100:1:8:1": "/raid/dataset/t5_small_train", # "gpt2:pytorch_1.12:A100:1:8:1": "/raid/dataset/gpt2", # "bert_hf:pytorch_1.13:A100:1:8:1": "/raid/dataset/bert_hf_train", @@ -160,5 +163,9 @@ # "mobilenetv2:pytorch_2.0:C500:1:8:1": "/raid/dataset/ImageNet_1k_2012/", # "mask_rcnn:pytorch_2.0:C500:1:8:1": "/raid/dataset/coco2017/", # "detr:pytorch_2.0:C500:1:8:1": "/raid/dataset/coco2017/", + + # dcu cases + # "glm:pytorch_1.13:K100:1:8:1": "/home/chenych/datasets/glm_train_datset/", + } From 077321adb62d641b39d879ac37a20706dafc13f6 Mon Sep 17 00:00:00 2001 From: jsnoc <61768944+jsnoc@users.noreply.github.com> Date: Tue, 5 Mar 2024 19:23:40 +0800 Subject: [PATCH 3/4] add resnet infer metax (#474) Co-authored-by: yaguang.wuyaguang --- inference/benchmarks/resnet50/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/inference/benchmarks/resnet50/README.md b/inference/benchmarks/resnet50/README.md index a42cc9948..c689f6a17 100644 --- a/inference/benchmarks/resnet50/README.md +++ b/inference/benchmarks/resnet50/README.md @@ -142,4 +142,5 @@ find ./val -name "*JPEG" | wc -l | kunlunxin_xtcl | fp32 | 128 | / | / | / | / | / | / | 76.2/76.2 | 4.52/32.0 | | kunlunxin_xtcl | fp16 | 256 | / | / | / | / | / | / | 76.2/76.2 | 4.52/32.0 | | zixiao | fp16 | 32*6 | 261.103 | / | / | 193.151 | 6342.191 | / | 76.2/76.2 | / | - +| metax-nocompiler | fp16 | 256 |/ | / | / | / | / | 7.8% | 76.2/76.2 | 3.83/64.0 | +| metax-nocompiler | fp32 | 256 | / | / | / | / | / | 7.7% | 76.2/76.2 | 5.46/64.0 | From a67831d4ca9a5ac2fa8df3bb8fe0e320dde9fa2d Mon Sep 17 00:00:00 2001 From: Kathrine Date: Wed, 6 Mar 2024 13:49:02 +0800 Subject: [PATCH 4/4] [metax] add bert_large inference result (#476) * add bert_hf result * Update README.md 1 * add glm result * [metax] Update glm README.md * update metax bertlarge inference result * update metax bert_large inference result * Update README.md --- inference/benchmarks/bertLarge/README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/inference/benchmarks/bertLarge/README.md b/inference/benchmarks/bertLarge/README.md index 
d7eeb4aeb..e627f7a7b 100644
--- a/inference/benchmarks/bertLarge/README.md
+++ b/inference/benchmarks/bertLarge/README.md
@@ -77,6 +77,19 @@ bert_reference_results_text_md5.txt
 
 - IXRT: ixrt-0.8.0+corex.3.2.1
 
+#### 2.4 MetaX C500
+
+- ##### Hardware environment
+  - Machine and accelerator model: 曦云®C500 64G
+- ##### Software environment
+  - OS version: Ubuntu 20.04.6
+  - OS kernel version: 5.4.0-26-generic
+  - Accelerator driver version: 2.2.0
+  - Docker version: 24.0.7
+  - Inference framework version: pytorch-2.0.0+mc2.18.0.8-cp38-cp38-linux_x86_64.whl
+
+
+
 ### 4. Results (BERT-Large)
 
 * Metric list
@@ -103,4 +116,6 @@
 | tensorrt | fp32 | 32 | 1868.8 | 150.4 | 152.2 | 190.4 | 194.1 | 42.0% | 0.638/0.638 | 16.9/40.0 |
 | kunlunxin_xtcl| W32A16 | 32 |/ | / | / | / | / | / | 0.638/0.638| /|
 | iluvatar_ixrt| fp16 | 32 |/ | / | / | / | / | / | 0.599/0.638| /|
+| metax-nocompiler| fp16 | 32 |/ | / | / | / | / | 27.6% | 0.638/0.638| 4.3/64.0|
+| metax-nocompiler| fp32 | 32 |/ | / | / | / | / | 28.1% | 0.639/0.638| 6.1/64.0|
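For readers reproducing these vendor results, the per-device `VRAM%`/`DCU%` columns that the DCU monitor from PATCH 2 appends to `dcu_monitor.log` can be summarized offline. The sketch below is illustrative only: `summarize_dcu_log`, the regex, and the default log path are not part of FlagPerf; the only assumption taken from this series is the rocm-smi table layout shown in training/dcu/README.md.

```
import re

# One rocm-smi device row, per the sample in training/dcu/README.md:
#   "0    53.0C   96.0W   auto  300.0W    0%   0%   Normal"
DEVICE_ROW = re.compile(
    r"^\s*\d+\s+[\d.]+C\s+[\d.]+W\s+\S+\s+[\d.]+W\s+(\d+)%\s+(\d+)%")


def summarize_dcu_log(path="./logs/dcu_monitor.log"):
    """Print the sample count and peak VRAM%/DCU% across all snapshots."""
    vram, util = [], []
    with open(path) as f:
        for line in f:
            m = DEVICE_ROW.match(line)
            if m:
                vram.append(int(m.group(1)))  # VRAM% column
                util.append(int(m.group(2)))  # DCU% column
    if not util:
        print("no device rows found in", path)
        return
    print("samples=%d  peak VRAM%%=%d  peak DCU%%=%d"
          % (len(util), max(vram), max(util)))


if __name__ == "__main__":
    summarize_dcu_log()
```

Per the monitor's own usage string, the log is produced by `python3 training/dcu/dcu_monitor.py -o start -l ./logs/`; run the summary after stopping the daemon with `-o stop`.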