diff --git a/inference/benchmarks/stable_diffusion_v1_4/README.md b/inference/benchmarks/stable_diffusion_v1_4/README.md
index 0df0c5920..c73c77b4a 100644
--- a/inference/benchmarks/stable_diffusion_v1_4/README.md
+++ b/inference/benchmarks/stable_diffusion_v1_4/README.md
@@ -59,3 +59,5 @@
 | kunlunxin_xtcl | fp32 | 2 | / | / | / | / | / | / | 26.524/25.3 | 0.07/32.0 |
 | null | fp16 | 16 | / | 11.7 | 60.7 | / | / | 13.2% | -/25.2 | 5.7/40.0 |
 | null | fp32 | 8 | / | 9.3 | 27.3 | / | / | 11.9% | -/25.3 | 6.3/40.0 |
+| metax-nocompiler | fp16 | 64 | / | / | / | / | / | 12.7% | -/25.4 | 14.7/64.0 |
+| metax-nocompiler | fp32 | 16 | / | / | / | / | / | 10.3% | -/25.4 | 55.57/64.0 |
diff --git a/inference/configs/host.yaml b/inference/configs/host.yaml
index 7ad020c8e..191867429 100644
--- a/inference/configs/host.yaml
+++ b/inference/configs/host.yaml
@@ -8,10 +8,13 @@ SSH_PORT: "22"
 HOSTS_PORTS: ["2222"]
 MASTER_PORT: "29501"
 SHM_SIZE: "32G"
+# metax:
+# " --device=/dev/dri --device=/dev/mxcd --group-add video"
 ACCE_CONTAINER_OPT: " --gpus all"
 PIP_SOURCE: "https://mirror.baidu.com/pypi/simple"
 CLEAR_CACHES: True
 ACCE_VISIBLE_DEVICE_ENV_NAME: "CUDA_VISIBLE_DEVICES"
 CASES:
     # "resnet50:pytorch_1.13": "/raid/dataset/ImageNet/imagenet/val"
-    "vit_l_16:pytorch_2.1": "/raid/dataset/ImageNet_1k_2012/val"
\ No newline at end of file
+    "vit_l_16:pytorch_2.1": "/raid/dataset/ImageNet_1k_2012/val"
+    #"stable_diffusion_v1_4:pytorch_2.0": "/raid/dataset/stable_diffusion_v1_4/"
\ No newline at end of file
diff --git a/inference/docker_images/metax/metax_analysis.py b/inference/docker_images/metax/metax_analysis.py
new file mode 100644
index 000000000..ce863b57b
--- /dev/null
+++ b/inference/docker_images/metax/metax_analysis.py
@@ -0,0 +1,14 @@
+def analysis_log(logpath):
+    logfile = open(logpath)
+
+    max_usage = 0.0
+    max_mem = 0.0
+    for line in logfile.readlines():
+        if "MiB" in line:
+            # memory samples are logged as "used/total" in MiB
+            usage_and_maxusage = line.split(" ")[2]
+            usage = float(usage_and_maxusage.split("/")[0])
+            max_usage = max(max_usage, usage)
+            max_mem = float(usage_and_maxusage.split("/")[1])
+    logfile.close()
+    return round(max_usage / 1024.0, 2), round(max_mem / 1024.0, 2), 120e12, 240e12
\ No newline at end of file
diff --git a/inference/docker_images/metax/metax_monitor.py b/inference/docker_images/metax/metax_monitor.py
new file mode 100644
index 000000000..20c69e9cf
--- /dev/null
+++ b/inference/docker_images/metax/metax_monitor.py
@@ -0,0 +1,256 @@
+# !/usr/bin/env python3
+# encoding: utf-8
+'''
+Usage:  python3 metax_monitor.py -o operation -l [log_path]
+            -o, --operation     start|stop|restart|status
+            -l, --log           log path , ./logs/ default
+'''
+
+import os
+import sys
+import time
+import signal
+import atexit
+import argparse
+import datetime
+from multiprocessing import Process
+import subprocess
+import schedule
+
+
+class Daemon:
+    '''
+    daemon subprocess class.
+    usage: subclass this daemon and override the run() method.
+    sys-monitor.pid: in the /tmp/, auto del when unexpected exit.
+    verbose: debug mode, disabled default.
+ ''' + + def __init__(self, + pid_file, + log_file, + err_file, + gpu_log, + log_path, + rate=5, + stdin=os.devnull, + stdout=os.devnull, + stderr=os.devnull, + home_dir='.', + umask=0o22, + verbose=0): + self.stdin = stdin + self.stdout = stdout + self.stderr = stderr + self.home_dir = home_dir + self.verbose = verbose + self.pidfile = pid_file + self.logfile = log_file + self.errfile = err_file + self.gpufile = gpu_log + self.logpath = log_path + self.rate = rate + self.umask = umask + self.verbose = verbose + self.daemon_alive = True + + def get_pid(self): + try: + with open(self.pidfile, 'r') as pf: + pid = int(pf.read().strip()) + except IOError: + pid = None + except SystemExit: + pid = None + return pid + + def del_pid(self): + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + + def run(self): + ''' + NOTE: override the method in subclass + ''' + + def gpu_mon(file): + TIMESTAMP = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S') + cmd = "mx-smi |grep 'W' -m 1 | awk '{print $2, $3, $5,$6}' && mx-smi |grep 'MXC' -m 1 | awk '{print $7}'" + process = subprocess.Popen(cmd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + encoding='utf-8') + try: + out = process.communicate(timeout=10) + except subprocess.TimeoutExpired: + process.kill() + out = process.communicate() + + if process.returncode != 0: + result = "error" + result = TIMESTAMP + "\n" + out[0] + "\n" + with open(file, 'a') as f: + f.write(result) + + def timer_gpu_mon(): + gpu_process = Process(target=gpu_mon, args=(self.gpufile, )) + gpu_process.start() + + schedule.every(self.rate).seconds.do(timer_gpu_mon) + while True: + schedule.run_pending() + time.sleep(5) + + def daemonize(self): + if self.verbose >= 1: + print('daemon process starting ...') + try: + pid = os.fork() + if pid > 0: + sys.exit(0) + except OSError as e: + sys.stderr.write('fork #1 failed: %d (%s)\n' % + (e.errno, e.strerror)) + sys.exit(1) + os.chdir(self.home_dir) + os.setsid() + os.umask(self.umask) + try: + pid = os.fork() + if pid > 0: + sys.exit(0) + except OSError as e: + sys.stderr.write('fork #2 failed: %d (%s)\n' % + (e.errno, e.strerror)) + sys.exit(1) + sys.stdout.flush() + sys.stderr.flush() + si = open(self.stdin, 'r') + so = open(self.stdout, 'a+') + if self.stderr: + se = open(self.stderr, 'a+') + else: + se = so + os.dup2(si.fileno(), sys.stdin.fileno()) + os.dup2(so.fileno(), sys.stdout.fileno()) + os.dup2(se.fileno(), sys.stderr.fileno()) + atexit.register(self.del_pid) + pid = str(os.getpid()) + with open(self.pidfile, 'w+') as f: + f.write('%s\n' % pid) + + def start(self): + if not os.path.exists(self.logpath): + os.makedirs(self.logpath) + elif os.path.exists(self.gpufile): + os.remove(self.gpufile) + if self.verbose >= 1: + print('ready to start ......') + # check for a pid file to see if the daemon already runs + pid = self.get_pid() + if pid: + msg = 'pid file %s already exists, is it already running?\n' + sys.stderr.write(msg % self.pidfile) + sys.exit(1) + # start the daemon + self.daemonize() + self.run() + + def stop(self): + if self.verbose >= 1: + print('stopping ...') + pid = self.get_pid() + if not pid: + msg = 'pid file [%s] does not exist. 
Not running?\n' % self.pidfile + sys.stderr.write(msg) + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + return + # try to kill the daemon process + try: + i = 0 + while 1: + os.kill(pid, signal.SIGTERM) + time.sleep(1) + i = i + 1 + if i % 10 == 0: + os.kill(pid, signal.SIGHUP) + except OSError as err: + err = str(err) + if err.find('No such process') > 0: + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + else: + print(str(err)) + sys.exit(1) + if self.verbose >= 1: + print('Stopped!') + + def restart(self): + self.stop() + self.start() + + def status(self): + pid = self.get_pid() + if pid: + if os.path.exists('/proc/%d' % pid): + return pid + return False + + +def parse_args(): + ''' Check script input parameter. ''' + parse = argparse.ArgumentParser(description='Sys monitor script') + parse.add_argument('-o', + type=str, + metavar='[operation]', + required=True, + help='start|stop|restart|status') + parse.add_argument('-l', + type=str, + metavar='[log_path]', + required=False, + default='./logs/', + help='log path') + args = parse.parse_args() + return args + + +def main(): + sample_rate1 = 5 + args = parse_args() + operation = args.o + log_path = args.l + pid_fn = str('/tmp/gpu_monitor.pid') + log_fn = str(log_path + '/metax_monitor.log') + err_fn = str(log_path + '/metax_monitor.err') + # result for gpu + gpu_fn = str(log_path + '/metax_monitor.log') + + subdaemon = Daemon(pid_fn, + log_fn, + err_fn, + gpu_fn, + log_path, + verbose=1, + rate=sample_rate1) + if operation == 'start': + subdaemon.start() + elif operation == 'stop': + subdaemon.stop() + elif operation == 'restart': + subdaemon.restart() + elif operation == 'status': + pid = subdaemon.status() + if pid: + print('process [%s] is running ......' % pid) + else: + print('daemon process [%s] stopped' % pid) + else: + print("invalid argument!") + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/inference/docker_images/metax/pytorch_2.0/Dockerfile b/inference/docker_images/metax/pytorch_2.0/Dockerfile new file mode 100644 index 000000000..0a813966e --- /dev/null +++ b/inference/docker_images/metax/pytorch_2.0/Dockerfile @@ -0,0 +1,11 @@ +FROM mxcr.io/library/maca-c500-pytorch:2.19.2.5-ubuntu18.04-amd64 +ENV PATH="/opt/conda/bin:${PATH}" +ENV PYTORCH_USE_FLASHATTN=1 +RUN pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple +RUN pip install numpy +RUN pip install pyyaml +RUN pip install timm +RUN pip install munch +RUN pip install schedule +RUN pip install loguru +RUN /bin/bash -c "uname -a" \ No newline at end of file diff --git a/training/benchmarks/llama2_70B/megatron/megatron_main.sh b/training/benchmarks/llama2_70B/megatron/megatron_main.sh index 812d57ae4..390588bc2 100644 --- a/training/benchmarks/llama2_70B/megatron/megatron_main.sh +++ b/training/benchmarks/llama2_70B/megatron/megatron_main.sh @@ -131,8 +131,10 @@ LOGGING_ARGS=" --log-interval 1 " +CODE_PATH="/workspace/FlagScale/pretrain_llama.py" + source $VENDOR_SHELL -cmd="torchrun $DISTRIBUTED_ARGS /workspace/FlagScale/pretrain_llama.py \ +cmd="torchrun $DISTRIBUTED_ARGS $CODE_PATH \ $TRAINING_ARGS \ $MIXED_PRECISION_ARGS \ $DATA_ARGS \ diff --git a/training/kunlunxin/docker_image/megatron/Dockerfile b/training/kunlunxin/docker_image/megatron/Dockerfile new file mode 100644 index 000000000..d02bf6b01 --- /dev/null +++ b/training/kunlunxin/docker_image/megatron/Dockerfile @@ -0,0 +1,6 @@ +FROM iregistry.baidu-int.com/xmlir/xmlir_ubuntu_2004_x86_64:v0.27 +RUN /bin/bash -c "pip config set global.index-url 
https://mirror.baidu.com/pypi/simple"
+RUN /bin/bash -c "uname -a"
+RUN /bin/bash -c "alias python3=python"
+
+ENV PATH /root/miniconda/envs/python38_torch201_cuda/bin:$PATH
diff --git a/training/kunlunxin/docker_image/megatron/megatron_install.sh b/training/kunlunxin/docker_image/megatron/megatron_install.sh
new file mode 100644
index 000000000..e79533ee3
--- /dev/null
+++ b/training/kunlunxin/docker_image/megatron/megatron_install.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# using github mirrors to avoid github TTL
+#export https_proxy=http://10.1.0.34:7890
+git clone https://githubfast.com/FlagOpen/FlagScale
+cd FlagScale
+
+git checkout eb0438a5459404e2e4c70b15fa37e9a197ab159d
+echo 'export PYTHONPATH=$PYTHONPATH:/home/FlagScale' >> /root/.bashrc
+source /root/.bashrc
+
+wget https://bd.bcebos.com/v1/klx-pytorch-work-bd/training/zhangling21_llama70B/xmlir201_5.run
+bash xmlir201_5.run
+XFLAGS --enable transformer_engine
+XFLAGS --enable flagscale
\ No newline at end of file
diff --git a/training/kunlunxin/llama2_70B-megatron/README.md b/training/kunlunxin/llama2_70B-megatron/README.md
new file mode 100644
index 000000000..ec69c5245
--- /dev/null
+++ b/training/kunlunxin/llama2_70B-megatron/README.md
@@ -0,0 +1,49 @@
+### 昆仑芯XPU配置与运行信息参考
+#### 环境配置
+- ##### 硬件环境
+  - 机器型号: 昆仑芯AI加速器组R480-X8
+  - 加速卡型号: 昆仑芯AI加速卡R300
+  - 多机网络类型、带宽: InfiniBand,200Gb/s
+
+- ##### 软件环境
+  - OS版本:Ubuntu 20.04
+  - OS kernel版本: 5.4.0-26-generic
+  - 加速卡驱动版本:4.0.25
+  - Docker镜像和版本:iregistry.baidu-int.com/xmlir/xmlir_ubuntu_2004_x86_64:v0.27
+  - 训练框架版本:xmlir
+  - 训练编译器版本:xacc
+  - 依赖软件版本:pytorch-2.0.1
+
+
+### 运行情况
+
+* 输入批尺寸
+  1. local_batchsize(micro_batchsize),简写为LBS,即实际进入模型的张量批尺寸,为config_R300x10x8.py中所写,在本case中默认为1
+  2. seqlength(max_position_embedding),简写为MPE,即实际进入模型的序列长度,为config_R300x10x8.py中所写,在本case中默认为4096
+  3. gradient_accumulate_steps,简写为GAS,即梯度累加步数,为config_R300x10x8.py中所写,在本case中默认为44
+  4.
global_batchsize恒等于local_batchsize\*gradient_accumulate_steps\*data_parallel_size。在本case中,data_parallel_size=world_size/TPsize/PPsize。 + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| ------------ | -------------------------- | ---------------------------------- | +| 任务类别 | 自然语言理解 | | +| 模型 | llama2_70b | | +| 数据集 | pile wikipedia | | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16/bf16 | +| 超参修改 | parallel,见“性能指标” | 格式为TPxPPyDPz,例如TP2PP1DP4 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | nvidia H800 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 计算使用率 | MFU,见“性能指标” | 参见PaLM论文定义 | +| **吞吐量** | **token/p/s,见“性能指标”** | 平均单卡每秒处理的token数 | + +* 性能指标 + +值得注意的是,下列第4组实验的global_batchsize与llama2原始论文相同, 训练100 step,此项实验也将作为精度对齐所用实验。 + +| 配置 | precision | parallel | fix_hp | token/p/s | 是否精度对齐 | mem | MFU | +| ------------------- | --------- | --------- | ---------------------------- | --------- | ----- | ----- | --- | +| R300十机80卡(10x8) | fp32 | TP8PP10DP1 | / | / | / | 21/32 | / | +| R300十机80卡(10x8) | amp | TP8PP10DP1 | GAS=1024(GBS=1024=4M tokens) | / | doing* | 21/32 | / | +因缺少R300机器,在单卡R300与单卡GPU上初步验证精度。目前已通过减小模型层数的方式,在单卡R300与单卡GPU上验证精度。完整70B模型的精度验证进行中。 diff --git a/training/kunlunxin/llama2_70B-megatron/config/config_R300x10x8.py b/training/kunlunxin/llama2_70B-megatron/config/config_R300x10x8.py new file mode 100644 index 000000000..c56990d52 --- /dev/null +++ b/training/kunlunxin/llama2_70B-megatron/config/config_R300x10x8.py @@ -0,0 +1,10 @@ +seqlength = 4096 +batchsize = 1 +accumulate_steps = 44 +train_tokens = 100000000 +theoryflops = 256000000000000.0 +epochs = 1 +flashattn = False +recompute = False +tensor_parallel = 8 +pipeline_parallel = 10 diff --git a/training/kunlunxin/llama2_70B-megatron/config/environment_variables.sh b/training/kunlunxin/llama2_70B-megatron/config/environment_variables.sh new file mode 100644 index 000000000..90a1a4e5e --- /dev/null +++ b/training/kunlunxin/llama2_70B-megatron/config/environment_variables.sh @@ -0,0 +1 @@ +export PATH=/root/miniconda/envs/python38_torch201_cuda/bin:$PATH \ No newline at end of file diff --git a/training/kunlunxin/llama2_70B-megatron/config/requirements.txt b/training/kunlunxin/llama2_70B-megatron/config/requirements.txt new file mode 100644 index 000000000..ad213956e --- /dev/null +++ b/training/kunlunxin/llama2_70B-megatron/config/requirements.txt @@ -0,0 +1 @@ +sentencepiece \ No newline at end of file diff --git a/training/kunlunxin/llama2_70B-megatron/config/training_adapter.sh b/training/kunlunxin/llama2_70B-megatron/config/training_adapter.sh new file mode 100644 index 000000000..fcc2210d5 --- /dev/null +++ b/training/kunlunxin/llama2_70B-megatron/config/training_adapter.sh @@ -0,0 +1,61 @@ +export PYTHONPATH=$PYTHONPATH:/home/FlagScale + +MIXED_PRECISION_ARGS="" + +CODE_PATH="/home/FlagScale/pretrain_llama.py" + +TRAINING_ARGS=" + --train-samples $TRAIN_SAMPLES \ + --eval-iters 0 \ + --tensor-model-parallel-size $TP \ + --pipeline-model-parallel-size $PP \ + --micro-batch-size $M_BATCHSIZE \ + --global-batch-size $G_BATCHSIZE \ + --disable-bias-linear \ + --optimizer adam \ + --no-gradient-accumulation-fusion \ + --recompute-granularity 'full' \ + --recompute-num-layers 1 \ + --recompute-method 'uniform' \ + --no-async-tensor-model-parallel-allreduce \ + --distribute-saved-activations +" +NETWORK_ARGS=" + --num-layers 80 \ + --hidden-size 8192 \ + --num-attention-heads 64 \ + --ffn-hidden-size 28672 \ + --seq-length $SEQLENGTH \ + --max-position-embeddings $SEQLENGTH \ + --normalization RMSNorm \ + 
--group-query-attention \ + --num-query-groups 8 \ + --use-rotary-position-embeddings \ + --no-position-embedding \ + --swiglu \ + --multiple-of 4096 \ + --untie-embeddings-and-output-weights +" + + +export BKCL_CCIX_BUFFER_GM=1 +export BKCL_CCIX_RING=1 +export BKCL_TREE_THRESHOLD=1 + +export BKCL_SOCKET_IFNAME=ibs11 +export BKCL_USE_RDMA=0 + +export BKCL_RDMA_FORCE_TREE=1 +export BKCL_ENABLE_XDR=0 +export BKCL_RING_BUFFER_SIZE=1024000 +export BKCL_RDMA_NICS=ibs11 +export BKCL_FORCE_ALLREDUCE_IN_MULTINODE=1 +worker_num=0 + +ulimit -c 0 +export XMLIR_F_XPU_ENABLED_BOOL=true +export ALLREDUCE_ASYNC=false +export ALLGATHER_ASYNC=false +export ALLREDUCE_FUSION=0 +export BKCL_TIMEOUT=1800 +export BKCL_FORCE_SYNC=1 \ No newline at end of file diff --git a/training/metax/bert_hf-pytorch/README.md b/training/metax/bert_hf-pytorch/README.md index 931958679..0b23692f7 100644 --- a/training/metax/bert_hf-pytorch/README.md +++ b/training/metax/bert_hf-pytorch/README.md @@ -1,4 +1,4 @@ -### Nvidia GPU配置与运行信息参考 +### 沐曦集成电路 C500 GPU配置与运行信息参考 #### 环境配置 - ##### 硬件环境 - 机器、加速卡型号: 曦云®C500 64G diff --git a/training/metax/detr-pytorch/README.md b/training/metax/detr-pytorch/README.md new file mode 100644 index 000000000..a16e7260d --- /dev/null +++ b/training/metax/detr-pytorch/README.md @@ -0,0 +1,45 @@ +### 测试数据集下载 +参见[测试数据集下载](../../benchmarks/detr/README.md#测试数据集下载地址) + +### 沐曦集成电路 C500 GPU配置与运行信息参考 +#### 环境配置 +- ##### 硬件环境 + - 机器、加速卡型号: 曦云®C500 64G + - 多机网络类型、带宽: InfiniBand,2x200 Gb/s + +- ##### 软件环境 + - OS版本:Ubuntu 20.04.6 + - OS kernel版本: 5.4.0-26-generic + - 加速卡驱动版本:2.2.0 + - Docker 版本:24.0.7 + - 训练框架版本:pytorch-2.0.0+mc2.19.0.6-cp38-cp38-linux_x86_64.whl + - 依赖软件版本:无 + +#### 运行情况 + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| -------------- | ------------------------------ | ------------------------------------------- | +| 任务类别 | 目标检测、全景分割 | | +| 模型 | detr | | +| 数据集 | coco2017 | | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | nvidia A100 | | +| 硬件存储使用 | mem(actual/total),见“性能指标” | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | +| 总吞吐量 | p_whole,见“性能指标” | 实际训练样本数除以总时间(performance_whole) | +| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | +| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1) | +| 训练结果 | mAP,见“性能指标” | mean Average Precision | +| 额外修改项 | 无 | | + + +* 性能指标 + +| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | mAP | mem | +| ------------------- | --------- | -------------- | -------- | ------- | ------- | ------ | ----- | --------- | +| MXC500 单机8卡(1x8) | fp32 | bs=8,lr=0.0001 | | | | |39.6%| 57.2/64.0 | +| MXC500 单机单卡(1x1)| fp32 | / | | | | | | 60.7/64.0 | +| MXC500 两机16卡(2x8) | fp32 | / | | | | | | 46.3/64.0 | diff --git a/training/metax/detr-pytorch/config/config_C500x1x1.py b/training/metax/detr-pytorch/config/config_C500x1x1.py new file mode 100644 index 000000000..f09c8b6c3 --- /dev/null +++ b/training/metax/detr-pytorch/config/config_C500x1x1.py @@ -0,0 +1,4 @@ +from config_common import * + +train_batch_size = 8 +eval_batch_size = 8 \ No newline at end of file diff --git a/training/metax/detr-pytorch/config/config_C500x1x8.py b/training/metax/detr-pytorch/config/config_C500x1x8.py new file mode 100644 index 000000000..f09c8b6c3 --- /dev/null +++ b/training/metax/detr-pytorch/config/config_C500x1x8.py @@ -0,0 +1,4 @@ +from config_common import * + +train_batch_size = 8 +eval_batch_size = 8 \ No newline at end of file diff --git a/training/metax/detr-pytorch/config/config_C500x2x8.py 
b/training/metax/detr-pytorch/config/config_C500x2x8.py new file mode 100644 index 000000000..f09c8b6c3 --- /dev/null +++ b/training/metax/detr-pytorch/config/config_C500x2x8.py @@ -0,0 +1,4 @@ +from config_common import * + +train_batch_size = 8 +eval_batch_size = 8 \ No newline at end of file diff --git a/training/metax/detr-pytorch/config/config_common.py b/training/metax/detr-pytorch/config/config_common.py new file mode 100644 index 000000000..851b29d4e --- /dev/null +++ b/training/metax/detr-pytorch/config/config_common.py @@ -0,0 +1,2 @@ +vendor = "metax" +dist_backend = "nccl" \ No newline at end of file diff --git a/training/metax/detr-pytorch/config/environment_variables.sh b/training/metax/detr-pytorch/config/environment_variables.sh new file mode 100644 index 000000000..a7f429ac2 --- /dev/null +++ b/training/metax/detr-pytorch/config/environment_variables.sh @@ -0,0 +1,5 @@ +# ================================================= +# Export variables +# ================================================= + +export METAX_USE_TF32=1 diff --git a/training/metax/detr-pytorch/config/requirements.txt b/training/metax/detr-pytorch/config/requirements.txt new file mode 100644 index 000000000..061205713 --- /dev/null +++ b/training/metax/detr-pytorch/config/requirements.txt @@ -0,0 +1,6 @@ +cython +git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI&egg=pycocotools +submitit +scipy +onnx +onnxruntime \ No newline at end of file diff --git a/training/metax/detr-pytorch/extern/.gitkeep b/training/metax/detr-pytorch/extern/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/training/metax/glm-pytorch/README.md b/training/metax/glm-pytorch/README.md new file mode 100644 index 000000000..0eaff257b --- /dev/null +++ b/training/metax/glm-pytorch/README.md @@ -0,0 +1,46 @@ +### 模型Checkpoint下载 +[模型Checkpoint下载](../../benchmarks/glm/README.md#模型checkpoint) +### 测试数据集下载 +[测试数据集下载](../../benchmarks/glm/README.md#数据集) + +### 沐曦集成电路 C500 GPU配置与运行信息参考 +#### 环境配置 +- ##### 硬件环境 + - 机器、加速卡型号: 曦云®C500 64G + - 多机网络类型、带宽: InfiniBand,2x200 Gb/s +- ##### 软件环境 + - OS版本:Ubuntu 20.04.6 + - OS kernel版本: 5.4.0-26-generic + - 加速卡驱动版本:2.2.0 + - Docker 版本:24.0.7 + - 训练框架版本:pytorch-2.0.0+mc2.18.0.8-cp38-cp38-linux_x86_64.whl + - 依赖软件版本:无 + + +### 运行情况 +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| ---------------- | -------------------------------------------- | ------------------------------------------- | +| 任务类别 | 自然语言理解、无条件文本生成、有条件文本生成 | | +| 模型 | GLM | | +| 数据集 | superglue | | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | MXC500 | | +| 硬件存储使用 | mem(actual/total),见“性能指标” | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | +| 总吞吐量 | p_whole,见“性能指标” | 实际训练样本数除以总时间(performance_whole) | +| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | +| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1) | +| **计算卡使用率** | **\*MFU** | model flops utilization | +| 训练结果 | acc,见“性能指标” | 准确率 | +| 额外修改项 | 无 | | + +* 性能指标 + +| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem | MFU | +| ------------------- | --------- | --------------- | -------- | ------- | ------- | ------ | ----- | --------- | ----- | +| C500单机8卡(1x8) | fp32 | / | | | | | 0.802 | 54.5/64.0 | | +| C500单机单卡(1x1) | fp32 | / | | | | | / | 50.4/64.0 | | +| C500两机16卡(2x8) | fp32 | / | | | | | / | 29.8/64.0 | | diff --git a/training/metax/glm-pytorch/config/config_C500x1x1.py b/training/metax/glm-pytorch/config/config_C500x1x1.py new file mode 100644 
index 000000000..3c1125e57 --- /dev/null +++ b/training/metax/glm-pytorch/config/config_C500x1x1.py @@ -0,0 +1,19 @@ +train_batch_size = 16 +eval_batch_size = 16 + +max_samples_termination = 24135 + +dist_backend = "nccl" + +lr = 1e-5 +weight_decay = 0.1 +adam_beta1 = 0.9 +adam_beta2 = 0.999 +adam_eps = 1e-08 +gradient_accumulation_steps = 1 +warmup = 0.1 +lr_decay_ratio = 0.1 +lr_decay_iters = 4338 +log_freq = 1 + +training_event = None diff --git a/training/metax/glm-pytorch/config/config_C500x1x8.py b/training/metax/glm-pytorch/config/config_C500x1x8.py new file mode 100644 index 000000000..3cc3e2723 --- /dev/null +++ b/training/metax/glm-pytorch/config/config_C500x1x8.py @@ -0,0 +1,18 @@ +train_batch_size = 16 +eval_batch_size = 16 + +dist_backend = "nccl" + +lr = 1e-5 +weight_decay = 0.1 +adam_beta1 = 0.9 +adam_beta2 = 0.999 +adam_eps = 1e-08 +gradient_accumulation_steps = 1 +warmup = 0.1 +lr_decay_ratio = 0.1 +lr_decay_iters = 4338 +log_freq = 1 +seed = 10483 +max_samples_termination = 5553080 +training_event = None diff --git a/training/metax/glm-pytorch/config/config_C500x2x8.py b/training/metax/glm-pytorch/config/config_C500x2x8.py new file mode 100644 index 000000000..a40988fed --- /dev/null +++ b/training/metax/glm-pytorch/config/config_C500x2x8.py @@ -0,0 +1,22 @@ +fp16 = True +ddp_type = "apex" +train_batch_size = 8 +eval_batch_size = 8 + +dist_backend = "nccl" + +lr = 1e-5 +weight_decay = 0.1 +adam_beta1 = 0.9 +adam_beta2 = 0.999 +adam_eps = 1e-08 +gradient_accumulation_steps = 1 +warmup = 0.1 +lr_decay_ratio = 0.1 +lr_decay_iters = 4338 +log_freq = 1 + +training_event = None + +max_samples_termination = 1388270 * 4 +target_accuracy = 0.8 diff --git a/training/metax/glm-pytorch/config/requirements.txt b/training/metax/glm-pytorch/config/requirements.txt new file mode 100644 index 000000000..3adfcca6c --- /dev/null +++ b/training/metax/glm-pytorch/config/requirements.txt @@ -0,0 +1,3 @@ +h5sparse +boto3 +h5py diff --git a/training/metax/glm-pytorch/extern/converter.py b/training/metax/glm-pytorch/extern/converter.py new file mode 100644 index 000000000..330bee79a --- /dev/null +++ b/training/metax/glm-pytorch/extern/converter.py @@ -0,0 +1,21 @@ +from driver import dist_pytorch +from .layers.transformer import GLMTransformer + + +def convert_model(model, config): + if dist_pytorch.get_rank() == 0: + print("use apex layer norm", flush=True) + state_dict = model.state_dict() + transformer_layer = GLMTransformer( + num_layers=config.num_layers, + hidden_size=config.hidden_size, + num_attention_heads=config.num_attention_heads, + max_sequence_length=config.max_seq_length, + max_memory_length=config.max_memory_length, + embedding_dropout_prob=config.hidden_dropout, + attention_dropout_prob=config.attention_dropout, + output_dropout_prob=config.hidden_dropout, + checkpoint_activations=config.checkpoint_activations) + model.model.transformer = transformer_layer + model.load_state_dict(state_dict, strict=True) + return model diff --git a/training/metax/glm-pytorch/extern/layers/__init__.py b/training/metax/glm-pytorch/extern/layers/__init__.py new file mode 100644 index 000000000..dab9da7e4 --- /dev/null +++ b/training/metax/glm-pytorch/extern/layers/__init__.py @@ -0,0 +1 @@ +from .transformer import * diff --git a/training/metax/glm-pytorch/extern/layers/layernorm.py b/training/metax/glm-pytorch/extern/layers/layernorm.py new file mode 100644 index 000000000..96a935cc6 --- /dev/null +++ b/training/metax/glm-pytorch/extern/layers/layernorm.py @@ -0,0 +1 @@ +from apex.normalization 
import FusedLayerNorm as LayerNorm diff --git a/training/metax/glm-pytorch/extern/layers/transformer.py b/training/metax/glm-pytorch/extern/layers/transformer.py new file mode 100644 index 000000000..82e98ed26 --- /dev/null +++ b/training/metax/glm-pytorch/extern/layers/transformer.py @@ -0,0 +1,298 @@ +import torch +import math + +from .transformer_block import GLMTransformerLayer +from .layernorm import LayerNorm +from model.models.checkpoint import checkpoint + + +def scaled_init_method(sigma, num_layers): + """Init method based on N(0, sigma/sqrt(2*num_layers).""" + std = sigma / math.sqrt(2.0 * num_layers) + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=std) + + return init_ + + +def unscaled_init_method(sigma): + """Init method based on N(0, sigma).""" + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) + + return init_ + + +class GLMTransformer(torch.nn.Module): + """GPT-2 transformer. + + This module takes input from embedding layer and it's output can + be used directly by a logit layer. It consists of L (num-layers) + blocks of: + layer norm + self attention + residual connection + layer norm + mlp + residual connection + followed by a final layer norm. + + Arguments: + num_layers: Number of transformer layers. + hidden_size: The hidden size of the self attention. + num_attention_heads: number of attention head in the self + attention. + attention_dropout_prob: dropout probability of the attention + score in self attention. + output_dropout_prob: dropout probability for the outputs + after self attention and final output. + checkpoint_activations: if True, checkpoint activations. + checkpoint_num_layers: number of layers to checkpoint. This + is basically the chunk size in checkpoitning. + layernorm_epsilon: epsilon used in layernorm to avoid + division by zero. + init_method_std: standard deviation of the init method which has + the form N(0, std). + use_scaled_init_for_output_weights: If Ture use 1/sqrt(2*num_layers) + scaling for the output weights ( + output of self attention and mlp). + """ + + def __init__( + self, + num_layers, + hidden_size, + num_attention_heads, + max_sequence_length, + max_memory_length, + embedding_dropout_prob, + attention_dropout_prob, + output_dropout_prob, + checkpoint_activations, + checkpoint_num_layers=1, + layernorm_epsilon=1.0e-5, + init_method_std=0.02, + use_scaled_init_for_output_weights=True, + block_position_encoding=True, + attention_scale=1.0, + ): + super(GLMTransformer, self).__init__() + self.hidden_size = hidden_size + # Store activation checkpoiting flag. + self.checkpoint_activations = checkpoint_activations + self.checkpoint_num_layers = checkpoint_num_layers + self.max_memory_length = max_memory_length + + output_layer_init_method = None + if use_scaled_init_for_output_weights: + output_layer_init_method = scaled_init_method( + init_method_std, num_layers) + # Embeddings dropout + self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob) + self.block_position_encoding = block_position_encoding + + # Position embedding (serial). 
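+        # GLM uses 2D position ids (token position, block position): with block position encoding enabled, two embedding tables of size max_sequence_length + 1 are created; otherwise a single standard position embedding table is used.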
+ if block_position_encoding: + self.position_embeddings = torch.nn.Embedding( + max_sequence_length + 1, hidden_size) + self.block_position_embeddings = torch.nn.Embedding( + max_sequence_length + 1, hidden_size) + torch.nn.init.normal_(self.block_position_embeddings.weight, + mean=0.0, + std=init_method_std) + else: + self.position_embeddings = torch.nn.Embedding( + max_sequence_length, hidden_size) + # Initialize the position embeddings. + torch.nn.init.normal_(self.position_embeddings.weight, + mean=0.0, + std=init_method_std) + + def get_layer(): + + return GLMTransformerLayer( + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + layernorm_epsilon, + unscaled_init_method(init_method_std), + output_layer_init_method=output_layer_init_method, + relative_encoding=False, + performer=False, + attention_scale=attention_scale) + + # Transformer layers. + self.layers = torch.nn.ModuleList( + [get_layer() for _ in range(num_layers)]) + + # Final layer norm before output. + self.final_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon) + + # if deepspeed.checkpointing.is_configured(): + # global get_cuda_rng_tracker, checkpoint + # get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker + # checkpoint = deepspeed.checkpointing.checkpoint + + def forward(self, + hidden_states, + position_ids, + attention_mask, + memory_states=None, + encoder_states=None, + return_memory=False, + detach_memory=True): + batch_size, query_length = hidden_states.size()[:2] + memory_length = memory_states[0].size(1) if memory_states else 0 + key_length = query_length + memory_length + # attention mask is the beginning postion of B region, \in [0, query_len) + is_scalar = torch.numel(attention_mask) == 1 + is_sep = is_scalar or torch.numel(attention_mask) == batch_size + + if is_sep: + sep = attention_mask.item() if is_scalar else attention_mask + + # conventional transformer + def build_mask_matrix(seq_length, sep, memory_length=0): + m = hidden_states.new_ones((1, seq_length, seq_length)) + m = torch.tril(m) + if is_scalar: + m[0, :, :sep] = 1 + else: + m = m.expand(batch_size, -1, -1) + ids = torch.arange(seq_length, + device=sep.device, + dtype=sep.dtype).view(1, -1) + mask = ids < sep.view(-1, 1) + m = m.masked_fill(mask.unsqueeze(1).expand_as(m), 1) + if memory_length > 0: + m = m.expand(batch_size, -1, -1) + m = torch.cat((hidden_states.new_ones( + (batch_size, seq_length, memory_length)), m), + dim=2) + m = m.unsqueeze(1) + return m + + attention_mask = build_mask_matrix(query_length, + sep, + memory_length=memory_length) + else: + attention_mask = attention_mask[:, :, :, + -query_length - memory_length:] + + if self.block_position_encoding: + position_ids, block_position_ids = position_ids[:, + 0], position_ids[:, + 1] + position_embeddings = self.position_embeddings(position_ids) + hidden_states = hidden_states + position_embeddings + if self.block_position_encoding: + block_position_embeddings = self.block_position_embeddings( + block_position_ids) + hidden_states = hidden_states + block_position_embeddings + hidden_states = self.embedding_dropout(hidden_states) + + def check_detach(_hidden_states): + if detach_memory: + return _hidden_states.detach() + return _hidden_states + + if self.max_memory_length > 0 or return_memory: + mem_layers = [check_detach(hidden_states)] + else: + mem_layers = [] + + def custom(start, end): + + def custom_forward(*inputs): + layers_ = self.layers[start:end] + x_, inputs = inputs[0], inputs[1:] + + inputs, mems_ = 
inputs[:1], inputs[1:] + for i, layer in enumerate(layers_): + mem_i_ = mems_[i] if mems_ else None + x_ = layer(x_, *inputs, mem=mem_i_) + if self.max_memory_length > 0 or return_memory: + mem_layers.append(check_detach(x_)) + return x_ + + return custom_forward + + if self.checkpoint_activations: + l = 0 + num_layers = len(self.layers) + chunk_length = self.checkpoint_num_layers + while l < num_layers: + args = [hidden_states, attention_mask] + if memory_states: + args += memory_states[l:l + chunk_length] + hidden_states = checkpoint(custom(l, l + chunk_length), *args) + l += chunk_length + else: + for i, layer in enumerate(self.layers): + args = [hidden_states, attention_mask] + mem_i = memory_states[i] if memory_states else None + hidden_states = layer(*args, mem=mem_i) + if self.max_memory_length > 0 or return_memory: + mem_layers.append(check_detach(hidden_states)) + + # Final layer norm. + output = self.final_layernorm(hidden_states) + if self.max_memory_length > 0 or return_memory: + mem_layers = self.update_mems(mem_layers, + memory_states, + return_memory=return_memory) + + return (output, mem_layers) + + def update_mems(self, hiddens, mems, return_memory=False): + memory_length = mems[0].size(1) if mems else 0 + query_length = hiddens[0].size(1) + new_memory_length = memory_length + query_length + if not return_memory: + new_memory_length = min(self.max_memory_length, new_memory_length) + new_mems = [] + # with torch.no_grad(): + for i in range(len(hiddens)): + if new_memory_length <= query_length: + new_mems.append(hiddens[i][:, -new_memory_length:]) + else: + new_mems.append( + torch.cat((mems[i][:, -new_memory_length + query_length:], + hiddens[i]), + dim=1)) + return new_mems + + +if __name__ == "__main__": + + batch_size = 2 + seq_len = 512 + hidden_size = 1024 + hidden_states = torch.rand([batch_size, seq_len, hidden_size], + dtype=torch.float32).to("cuda") + position_ids = torch.ones([batch_size, 2, seq_len], + dtype=torch.int64).to('cuda') + attention_mask = torch.tensor([5, 10]).to('cuda') + + model = GLMTransformer(num_layers=24, + hidden_size=1024, + num_attention_heads=16, + max_sequence_length=512, + max_memory_length=0, + embedding_dropout_prob=0.1, + attention_dropout_prob=0.1, + output_dropout_prob=0.1, + checkpoint_activations=True, + checkpoint_num_layers=1, + layernorm_epsilon=1.0e-5, + init_method_std=0.02, + use_scaled_init_for_output_weights=True, + block_position_encoding=True, + attention_scale=1.0).to('cuda') + + outputs = model(hidden_states, position_ids, attention_mask) + print(outputs[0].shape) + print(outputs[1]) diff --git a/training/metax/glm-pytorch/extern/layers/transformer_block.py b/training/metax/glm-pytorch/extern/layers/transformer_block.py new file mode 100644 index 000000000..c3d33466b --- /dev/null +++ b/training/metax/glm-pytorch/extern/layers/transformer_block.py @@ -0,0 +1,125 @@ +import torch + +from model.layers.attention import SelfAttention +from .layernorm import LayerNorm +from model.layers.mlp import GLMMLP + + +class GLMTransformerLayer(torch.nn.Module): + """A single layer transformer for GPT2. + + We use the following notation: + h: hidden size + n: number of attention heads + b: batch size + s: sequence length + Transformore layer takes input with size [b, s, h] and returns an + output of the same size. + + Arguments: + hidden_size: The hidden size of the self attention. + num_attention_heads: number of attention head in the self + attention. 
+ attention_dropout_prob: dropout probability of the attention + score in self attention. + output_dropout_prob: dropout probability for the outputs + after self attention and final output. + layernorm_epsilon: epsilon used in layernorm to avoid + division by zero. + init_method: initialization method used for the weights. Note + that all biases are initialized to zero and + layernorm weight are initialized to one. + output_layer_init_method: output layers (attention output and + mlp output) initialization. If None, + use `init_method`. + """ + + def __init__(self, + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + layernorm_epsilon, + init_method, + output_layer_init_method=None, + relative_encoding=False, + performer=False, + attention_scale=1.0): + super(GLMTransformerLayer, self).__init__() + # Set output layer initialization if not provided. + if output_layer_init_method is None: + output_layer_init_method = init_method + + # Layernorm on the input data. + self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon) + + # Self attention. + self.attention = SelfAttention( + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + init_method, + output_layer_init_method=output_layer_init_method, + relative_encoding=relative_encoding, + performer=performer, + attention_scale=attention_scale) + + # Layernorm on the input data. + self.post_attention_layernorm = LayerNorm(hidden_size, + eps=layernorm_epsilon) + + # MLP + self.mlp = GLMMLP(hidden_size, + output_dropout_prob, + init_method, + output_layer_init_method=output_layer_init_method) + + def forward(self, + hidden_states, + ltor_mask, + position_embeddings=None, + r_w_bias=None, + r_r_bias=None, + mem=None): + # hidden_states: [b, s, h] + # ltor_mask: [1, 1, s, s] + + # Layer norm at the begining of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + mem = self.input_layernorm(mem) if mem is not None else None + # Self attention. + attention_output = self.attention(layernorm_output, ltor_mask, + position_embeddings, r_w_bias, + r_r_bias, mem) + # Residual connection. + layernorm_input = hidden_states + attention_output + # Layer norm post the self attention. + layernorm_output = self.post_attention_layernorm(layernorm_input) + # MLP. + mlp_output = self.mlp(layernorm_output) + # Second residual connection. 
+ output = layernorm_input + mlp_output + + return output + + +if __name__ == "__main__": + batch_size = 8 + seq_len = 512 + hidden_size = 1024 + num_attention_heads = 16 + attention_dropout_prob = 0.1 + output_dropout_prob = 0.1 + layernorm_epsilon = 1e-10 + init_method = torch.nn.init.xavier_normal_ + test_transformer = GLMTransformerLayer(hidden_size, num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + layernorm_epsilon, init_method) + + hidden_states = torch.rand([batch_size, seq_len, hidden_size]) + ltor_mask = torch.ones([1, 1, seq_len, seq_len]) + + outputs = test_transformer(hidden_states, ltor_mask) + print(outputs.shape) diff --git a/training/metax/glm-pytorch/extern/trainer_adapter.py b/training/metax/glm-pytorch/extern/trainer_adapter.py new file mode 100644 index 000000000..d4c3837a5 --- /dev/null +++ b/training/metax/glm-pytorch/extern/trainer_adapter.py @@ -0,0 +1,80 @@ +import torch +import config + +from torch import nn + +from .converter import convert_model as _convert_model +from driver.dist_pytorch import main_proc_print +from typing import Tuple +from model.models.modeling import FP16_Module +from driver.dist_pytorch import PyTorchDistributedDataParallel as TorchDDP + +from optimizers.loss_scaler import DynamicLossScaler + +clip_grad_norm = torch.nn.utils.clip_grad_norm_ + + +def convert_model(model: torch.nn.Module) -> torch.nn.Module: + return _convert_model(model, config) + + +def model_to_fp16(model): + # To prevent OOM for model sizes that cannot fit in GPU memory in full precision + if config.fp16: + main_proc_print(" > use fp16...") + model.half() + + # GPU allocation. + model.cuda(torch.cuda.current_device()) + + # Fp16 conversion. + if config.fp16: + model = FP16_Module(model) + return model + + +def model_to_ddp(model: nn.Module) -> nn.Module: + i = torch.cuda.current_device() + if torch.distributed.is_available() and torch.distributed.is_initialized(): + model = TorchDDP(model, device_ids=[i], output_device=i) + return model + + +def backward(step, lm_loss, reduced_loss, optimizer, lr_scheduler, model): + args = config + + if not DynamicLossScaler._has_inf_or_nan(reduced_loss): + backward_step(optimizer, model, lm_loss, args) + if step % args.gradient_accumulation_steps == 0: + optimizer.step() + if not (args.fp16 and optimizer.overflow): + lr_scheduler.step() + optimizer.zero_grad() + + else: + main_proc_print("Found NaN loss, skip backward") + return reduced_loss + + +def backward_step(optimizer, model, lm_loss, args): + """Backward step.""" + + # Total loss. + loss = lm_loss + + if args.fp16: + optimizer.backward(loss, update_master_grads=False) + else: + loss.backward() + + if args.fp16: + optimizer.update_master_grads() + + # Clipping gradients helps prevent the exploding gradient. 
+ if args.clip_grad > 0: + if not args.fp16: + clip_grad_norm(model.parameters(), args.clip_grad) + else: + optimizer.clip_master_grads(args.clip_grad) + + return lm_loss diff --git a/training/metax/mask_rcnn-pytorch/README.md b/training/metax/mask_rcnn-pytorch/README.md new file mode 100644 index 000000000..3247c5e42 --- /dev/null +++ b/training/metax/mask_rcnn-pytorch/README.md @@ -0,0 +1,50 @@ +### 模型backbone权重下载 +[模型backbone权重下载](../../benchmarks/mask_rcnn) + +### 测试数据集下载 + +[测试数据集下载](https://cocodataset.org/) + +### 沐曦集成电路 C500 GPU配置与运行信息参考 +#### 环境配置 +- ##### 硬件环境 + - 机器、加速卡型号: 曦云®C500 64G + - 多机网络类型、带宽: InfiniBand,2x200 Gb/s + +- ##### 软件环境 + - OS版本:Ubuntu 20.04.6 + - OS kernel版本: 5.4.0-26-generic + - 加速卡驱动版本:2.2.0 + - Docker 版本:24.0.7 + - 训练框架版本:pytorch-2.0.0+mc2.19.0.6-cp38-cp38-linux_x86_64.whl + - 依赖软件版本:无 + + + + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| -------------- | ----------------------- | ------------------------------------------- | +| 任务类别 | 图像目标检测 | | +| 模型 | fasterRCNN | | +| 数据集 | coco2017 | | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | MXC500 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | +| 总吞吐量 | p_whole,见“性能指标” | 实际训练图片数除以总时间(performance_whole) | +| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | +| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1) | +| 训练结果 | map,见“性能指标” | 单位为平均目标检测正确率 | +| 额外修改项 | 无 | | + + +* 性能指标 + +| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | mAP | mem | +| ------------------- | --------- | -------------- | -------- | ------- | ------- | ------ | ----- | --------- | +| MXC500 单机8卡(1x8) | fp32 | bs=8,lr=0.0001 | | | | |0.382 && 0.343| 37.1/64.0 | +| MXC500 单机单卡(1x1)| fp32 | / | | | | | | 36.2/64.0 | +| MXC500 两机16卡(2x8) | fp32 | / | | | | | | 37.1/64.0 | diff --git a/training/metax/mask_rcnn-pytorch/config/config_C500x1x1.py b/training/metax/mask_rcnn-pytorch/config/config_C500x1x1.py new file mode 100644 index 000000000..b14441400 --- /dev/null +++ b/training/metax/mask_rcnn-pytorch/config/config_C500x1x1.py @@ -0,0 +1,5 @@ +vendor: str = "metax" +train_batch_size = 16 +eval_batch_size = 16 +lr = 0.16 +max_epoch: int = 1 \ No newline at end of file diff --git a/training/metax/mask_rcnn-pytorch/config/config_C500x1x8.py b/training/metax/mask_rcnn-pytorch/config/config_C500x1x8.py new file mode 100644 index 000000000..c11690f00 --- /dev/null +++ b/training/metax/mask_rcnn-pytorch/config/config_C500x1x8.py @@ -0,0 +1,4 @@ +vendor: str = "metax" +train_batch_size = 16 +eval_batch_size = 16 +lr = 0.16 \ No newline at end of file diff --git a/training/metax/mask_rcnn-pytorch/config/config_C500x2x8.py b/training/metax/mask_rcnn-pytorch/config/config_C500x2x8.py new file mode 100644 index 000000000..e81bc64bb --- /dev/null +++ b/training/metax/mask_rcnn-pytorch/config/config_C500x2x8.py @@ -0,0 +1,5 @@ +vendor: str = "metax" +train_batch_size = 16 +eval_batch_size = 16 +lr = 0.016 +max_epoch: int = 4 \ No newline at end of file diff --git a/training/metax/mask_rcnn-pytorch/config/environment_variables.sh b/training/metax/mask_rcnn-pytorch/config/environment_variables.sh new file mode 100644 index 000000000..a7f429ac2 --- /dev/null +++ b/training/metax/mask_rcnn-pytorch/config/environment_variables.sh @@ -0,0 +1,5 @@ +# ================================================= +# Export variables +# ================================================= + +export METAX_USE_TF32=1 diff --git 
a/training/metax/mask_rcnn-pytorch/config/requirements.txt b/training/metax/mask_rcnn-pytorch/config/requirements.txt new file mode 100644 index 000000000..846b45e40 --- /dev/null +++ b/training/metax/mask_rcnn-pytorch/config/requirements.txt @@ -0,0 +1,4 @@ +pycocotools +numpy +tqdm +schedule \ No newline at end of file diff --git a/training/metax/mask_rcnn-pytorch/extern/.gitkeep b/training/metax/mask_rcnn-pytorch/extern/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index 8b12afec9..27d0d79b0 100644 --- a/training/run_benchmarks/config/test_conf.py +++ b/training/run_benchmarks/config/test_conf.py @@ -134,7 +134,8 @@ # "transformer:pytorch:R300:1:8:1": "/raid/dataset/transformer/wmt14_en_de_joined_dict", # "bigtransfer:pytorch:R300:1:8:1": "/raid/dataset/ImageNet_1k_2012/", # "efficientnet:pytorch:R300:1:8:1": "/raid/dataset/ImageNet_1k_2012/", - + # "llama2_70B:megatron:R300:10:8:1": "/raid/dataset/llama2_70B_pretrain", + # iluvatar cases # "bigtransfer:pytorch:BI-V100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", # "vit:pytorch:BI-V100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", @@ -154,6 +155,10 @@ # "resnet50:pytorch_2.0:C500:1:8:1": "/raid/dataset/ImageNet_1k_2012/", # "swin_transformer:pytorch_2.0:C500:1:8:1": "/raid/dataset/ImageNet_1k_2012/", # "WaveGlow:pytorch_2.0:C500:1:8:1": "/raid/dataset/LJSpeech/", + # "bert_hf:pytorch_2.0:C500:1:8:1": "/raid/dataset/bert_hf_train", + # "glm:pytorch_2.0:C500:1:8:1": "/raid/home_datasets_ckpt/glm/train/", # "mobilenetv2:pytorch_2.0:C500:1:8:1": "/raid/dataset/ImageNet_1k_2012/", + # "mask_rcnn:pytorch_2.0:C500:1:8:1": "/raid/dataset/coco2017/", + # "detr:pytorch_2.0:C500:1:8:1": "/raid/dataset/coco2017/", }
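As a quick cross-check of the batch-size bookkeeping described in the kunlunxin llama2_70B README above, here is a minimal sketch using the values from config_R300x10x8.py and the `llama2_70B:megatron:R300:10:8:1` case key (variable names are illustrative and not part of the patch):

```python
# Values taken from config_R300x10x8.py and the 10-machine x 8-card case key.
micro_batch_size = 1              # batchsize
gradient_accumulate_steps = 44    # accumulate_steps
tensor_parallel = 8
pipeline_parallel = 10
world_size = 10 * 8               # 10 machines x 8 R300 cards

# data_parallel_size = world_size / TPsize / PPsize, per the README
data_parallel = world_size // (tensor_parallel * pipeline_parallel)                # -> 1
global_batch_size = micro_batch_size * gradient_accumulate_steps * data_parallel   # -> 44
print(data_parallel, global_batch_size)
```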