
Commit

Merge branch 'main' into moflow
yuzhou03 authored Mar 5, 2024
2 parents 285bb7f + 9b0280a commit e019f7a
Showing 41 changed files with 1,219 additions and 4 deletions.
2 changes: 2 additions & 0 deletions inference/benchmarks/stable_diffusion_v1_4/README.md
@@ -59,3 +59,5 @@
| kunlunxin_xtcl | fp32 | 2 | / | / | / | / | / | / | 26.524/25.3 | 0.07/32.0 |
| null | fp16 | 16 | / | 11.7 | 60.7 | / | / | 13.2% | -/25.2 | 5.7/40.0 |
| null | fp32 | 8 | / | 9.3 | 27.3 | / | / | 11.9% | -/25.3 | 6.3/40.0 |
| metax-nocompiler | fp16 | 64 | / | / | / | / | / | 12.7% | -/25.4 | 14.7/64.0 |
| metax-nocompiler | fp32 | 16 | / | / | / | / | / | 10.3% | -/25.4 | 55.57/64.0 |
5 changes: 4 additions & 1 deletion inference/configs/host.yaml
@@ -8,10 +8,13 @@ SSH_PORT: "22"
HOSTS_PORTS: ["2222"]
MASTER_PORT: "29501"
SHM_SIZE: "32G"
# metax:
# " --device=/dev/dri --device=/dev/mxcd --group-add video"
ACCE_CONTAINER_OPT: " --gpus all"
PIP_SOURCE: "https://mirror.baidu.com/pypi/simple"
CLEAR_CACHES: True
ACCE_VISIBLE_DEVICE_ENV_NAME: "CUDA_VISIBLE_DEVICES"
CASES:
# "resnet50:pytorch_1.13": "/raid/dataset/ImageNet/imagenet/val"
"vit_l_16:pytorch_2.1": "/raid/dataset/ImageNet_1k_2012/val"
"vit_l_16:pytorch_2.1": "/raid/dataset/ImageNet_1k_2012/val"
#"stable_diffusion_v1_4:pytorch_2.0": "/raid/dataset/stable_diffusion_v1_4/"
17 changes: 17 additions & 0 deletions inference/docker_images/metax/metax_analysis.py
@@ -0,0 +1,17 @@
def analysis_log(logpath):
    max_usage = 0.0
    max_mem = 0.0
    with open(logpath) as logfile:
        for line in logfile.readlines():
            # Memory lines carry a "used/total" MiB field in the third
            # space-separated column of the monitor log.
            if "MiB" in line:
                usage_and_maxusage = line.split(" ")[2]
                usage = float(usage_and_maxusage.split("/")[0])
                max_usage = max(max_usage, usage)
                max_mem = float(usage_and_maxusage.split("/")[1])
    print(max_mem)
    print(max_usage)
    # Convert MiB to GiB; the two trailing constants are the accelerator's
    # theoretical peak FLOPS values reported alongside the memory stats.
    return round(max_usage / 1024.0, 2), round(max_mem / 1024.0, 2), 120e12, 240e12
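
A minimal usage sketch for analysis_log. The synthetic log line below is an assumption inferred from how the parser splits fields (third space-separated token = "used/total" in MiB), not captured mx-smi output:

import tempfile

# Hypothetical monitor log: one timestamp line plus one memory line.
sample = "2024-03-05-12:00:00\n35C 120W 1024/65536 MiB\n"
with tempfile.NamedTemporaryFile("w", suffix=".log", delete=False) as f:
    f.write(sample)
    path = f.name

print(analysis_log(path))  # -> (1.0, 64.0, 120000000000000.0, 240000000000000.0)
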
256 changes: 256 additions & 0 deletions inference/docker_images/metax/metax_monitor.py
@@ -0,0 +1,256 @@
#!/usr/bin/env python3
# encoding: utf-8
'''
Usage: python3 metax_monitor.py -o operation -l [log_path]
    -o, --operation   start|stop|restart|status
    -l, --log         log path, ./logs/ by default
'''

import os
import sys
import time
import signal
import atexit
import argparse
import datetime
from multiprocessing import Process
import subprocess
import schedule


class Daemon:
    '''
    Daemon subprocess class.
    Usage: subclass this daemon and override the run() method.
    sys-monitor.pid: kept in /tmp/, removed automatically on unexpected exit.
    verbose: debug mode, disabled by default.
    '''

    def __init__(self,
                 pid_file,
                 log_file,
                 err_file,
                 gpu_log,
                 log_path,
                 rate=5,
                 stdin=os.devnull,
                 stdout=os.devnull,
                 stderr=os.devnull,
                 home_dir='.',
                 umask=0o22,
                 verbose=0):
        self.stdin = stdin
        self.stdout = stdout
        self.stderr = stderr
        self.home_dir = home_dir
        self.verbose = verbose
        self.pidfile = pid_file
        self.logfile = log_file
        self.errfile = err_file
        self.gpufile = gpu_log
        self.logpath = log_path
        self.rate = rate
        self.umask = umask
        self.daemon_alive = True

    def get_pid(self):
        try:
            with open(self.pidfile, 'r') as pf:
                pid = int(pf.read().strip())
        except IOError:
            pid = None
        except SystemExit:
            pid = None
        return pid

    def del_pid(self):
        if os.path.exists(self.pidfile):
            os.remove(self.pidfile)

    def run(self):
        '''
        NOTE: override this method in a subclass.
        '''

        def gpu_mon(file):
            TIMESTAMP = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
            # Sample power/temperature and memory usage from mx-smi.
            cmd = "mx-smi |grep 'W' -m 1 | awk '{print $2, $3, $5,$6}' && mx-smi |grep 'MXC' -m 1 | awk '{print $7}'"
            process = subprocess.Popen(cmd,
                                       shell=True,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.STDOUT,
                                       encoding='utf-8')
            try:
                out = process.communicate(timeout=10)
            except subprocess.TimeoutExpired:
                process.kill()
                out = process.communicate()

            if process.returncode != 0:
                result = TIMESTAMP + "\n" + "error" + "\n"
            else:
                result = TIMESTAMP + "\n" + out[0] + "\n"
            with open(file, 'a') as f:
                f.write(result)

        def timer_gpu_mon():
            gpu_process = Process(target=gpu_mon, args=(self.gpufile, ))
            gpu_process.start()

        schedule.every(self.rate).seconds.do(timer_gpu_mon)
        while True:
            schedule.run_pending()
            time.sleep(5)

    def daemonize(self):
        if self.verbose >= 1:
            print('daemon process starting ...')
        # First fork: detach from the parent process.
        try:
            pid = os.fork()
            if pid > 0:
                sys.exit(0)
        except OSError as e:
            sys.stderr.write('fork #1 failed: %d (%s)\n' %
                             (e.errno, e.strerror))
            sys.exit(1)
        os.chdir(self.home_dir)
        os.setsid()
        os.umask(self.umask)
        # Second fork: prevent the daemon from re-acquiring a controlling terminal.
        try:
            pid = os.fork()
            if pid > 0:
                sys.exit(0)
        except OSError as e:
            sys.stderr.write('fork #2 failed: %d (%s)\n' %
                             (e.errno, e.strerror))
            sys.exit(1)
        sys.stdout.flush()
        sys.stderr.flush()
        # Redirect the standard file descriptors.
        si = open(self.stdin, 'r')
        so = open(self.stdout, 'a+')
        if self.stderr:
            se = open(self.stderr, 'a+')
        else:
            se = so
        os.dup2(si.fileno(), sys.stdin.fileno())
        os.dup2(so.fileno(), sys.stdout.fileno())
        os.dup2(se.fileno(), sys.stderr.fileno())
        atexit.register(self.del_pid)
        pid = str(os.getpid())
        with open(self.pidfile, 'w+') as f:
            f.write('%s\n' % pid)

    def start(self):
        if not os.path.exists(self.logpath):
            os.makedirs(self.logpath)
        elif os.path.exists(self.gpufile):
            os.remove(self.gpufile)
        if self.verbose >= 1:
            print('ready to start ......')
        # Check for a pid file to see if the daemon is already running.
        pid = self.get_pid()
        if pid:
            msg = 'pid file %s already exists, is it already running?\n'
            sys.stderr.write(msg % self.pidfile)
            sys.exit(1)
        # Start the daemon.
        self.daemonize()
        self.run()

    def stop(self):
        if self.verbose >= 1:
            print('stopping ...')
        pid = self.get_pid()
        if not pid:
            msg = 'pid file [%s] does not exist. Not running?\n' % self.pidfile
            sys.stderr.write(msg)
            if os.path.exists(self.pidfile):
                os.remove(self.pidfile)
            return
        # Try to kill the daemon process.
        try:
            i = 0
            while True:
                os.kill(pid, signal.SIGTERM)
                time.sleep(1)
                i = i + 1
                if i % 10 == 0:
                    os.kill(pid, signal.SIGHUP)
        except OSError as err:
            err = str(err)
            if 'No such process' in err:
                if os.path.exists(self.pidfile):
                    os.remove(self.pidfile)
            else:
                print(str(err))
                sys.exit(1)
        if self.verbose >= 1:
            print('Stopped!')

    def restart(self):
        self.stop()
        self.start()

    def status(self):
        pid = self.get_pid()
        if pid:
            if os.path.exists('/proc/%d' % pid):
                return pid
        return False


def parse_args():
    ''' Check script input parameters. '''
    parse = argparse.ArgumentParser(description='Sys monitor script')
    parse.add_argument('-o',
                       type=str,
                       metavar='[operation]',
                       required=True,
                       help='start|stop|restart|status')
    parse.add_argument('-l',
                       type=str,
                       metavar='[log_path]',
                       required=False,
                       default='./logs/',
                       help='log path')
    args = parse.parse_args()
    return args


def main():
    sample_rate1 = 5
    args = parse_args()
    operation = args.o
    log_path = args.l
    pid_fn = str('/tmp/gpu_monitor.pid')
    log_fn = str(log_path + '/metax_monitor.log')
    err_fn = str(log_path + '/metax_monitor.err')
    # result for gpu
    gpu_fn = str(log_path + '/metax_monitor.log')

    subdaemon = Daemon(pid_fn,
                       log_fn,
                       err_fn,
                       gpu_fn,
                       log_path,
                       verbose=1,
                       rate=sample_rate1)
    if operation == 'start':
        subdaemon.start()
    elif operation == 'stop':
        subdaemon.stop()
    elif operation == 'restart':
        subdaemon.restart()
    elif operation == 'status':
        pid = subdaemon.status()
        if pid:
            print('process [%s] is running ......' % pid)
        else:
            print('daemon process [%s] stopped' % pid)
    else:
        print("invalid argument!")
        sys.exit(1)


if __name__ == '__main__':
    main()
11 changes: 11 additions & 0 deletions inference/docker_images/metax/pytorch_2.0/Dockerfile
@@ -0,0 +1,11 @@
FROM mxcr.io/library/maca-c500-pytorch:2.19.2.5-ubuntu18.04-amd64
ENV PATH="/opt/conda/bin:${PATH}"
ENV PYTORCH_USE_FLASHATTN=1
RUN pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
RUN pip install numpy
RUN pip install pyyaml
RUN pip install timm
RUN pip install munch
RUN pip install schedule
RUN pip install loguru
RUN /bin/bash -c "uname -a"
4 changes: 3 additions & 1 deletion training/benchmarks/llama2_70B/megatron/megatron_main.sh
@@ -131,8 +131,10 @@ LOGGING_ARGS="
--log-interval 1
"

CODE_PATH="/workspace/FlagScale/pretrain_llama.py"

source $VENDOR_SHELL
cmd="torchrun $DISTRIBUTED_ARGS /workspace/FlagScale/pretrain_llama.py \
cmd="torchrun $DISTRIBUTED_ARGS $CODE_PATH \
$TRAINING_ARGS \
$MIXED_PRECISION_ARGS \
$DATA_ARGS \
6 changes: 6 additions & 0 deletions training/kunlunxin/docker_image/megatron/Dockerfile
@@ -0,0 +1,6 @@
FROM iregistry.baidu-int.com/xmlir/xmlir_ubuntu_2004_x86_64:v0.27
RUN /bin/bash -c "pip config set global.index-url https://mirror.baidu.com/pypi/simple"
RUN /bin/bash -c "uname -a"
RUN /bin/bash -c "alias python3=python"

ENV PATH /root/miniconda/envs/python38_torch201_cuda/bin:$PATH
14 changes: 14 additions & 0 deletions training/kunlunxin/docker_image/megatron/megatron_install.sh
@@ -0,0 +1,14 @@
#!/bin/bash
# using github mirrors to avoid github TTL
#export https_proxy=http://10.1.0.34:7890
git clone https://githubfast.com/FlagOpen/FlagScale
cd FlagScale

git checkout eb0438a5459404e2e4c70b15fa37e9a197ab159d
echo 'export PYTHONPATH=$PYTHONPATH:/home/FlagScale' >> /root/.bashrc
source /root/.bashrc

wget https://bd.bcebos.com/v1/klx-pytorch-work-bd/training/zhangling21_llama70B/xmlir201_5.run
bash xmlir201_5.run
XFLAGS --enable transformer_engine
XFLAGS --enable flagscale
49 changes: 49 additions & 0 deletions training/kunlunxin/llama2_70B-megatron/README.md
@@ -0,0 +1,49 @@
### Kunlunxin XPU Configuration and Run Information
#### Environment Setup
- ##### Hardware
  - Machine model: Kunlunxin AI accelerator group R480-X8
  - Accelerator card model: Kunlunxin AI accelerator card R300
  - Multi-node network type and bandwidth: InfiniBand, 200 Gb/s

- ##### Software
  - OS version: Ubuntu 20.04
  - OS kernel version: 5.4.0-26-generic
  - Accelerator driver version: 4.0.25
  - Docker image and version: iregistry.baidu-int.com/xmlir/xmlir_ubuntu_2004_x86_64:v0.27
  - Training framework version: xmlir
  - Training compiler version: xacc
  - Dependency version: pytorch-2.0.1


### Run Results

* Input batch sizes
  1. local_batchsize (micro_batchsize), abbreviated LBS: the tensor batch size actually fed into the model, as set in config_H100x4x8.py; 1 by default in this case
  2. seqlength (max_position_embedding), abbreviated MPE: the sequence length actually fed into the model, as set in config_H100x4x8.py; 4096 by default in this case
  3. gradient_accumulate_steps, abbreviated GAS: the number of gradient-accumulation steps, as set in ds_config.json; 44 by default in this case
  4. global_batchsize is always local_batchsize \* gradient_accumulate_steps \* data_parallel_size, where data_parallel_size = world_size / TPsize / PPsize in this case; see the worked example below.
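
As a sanity check, here is a small Python sketch of the global-batch-size arithmetic for the 10x8 R300 configuration listed in this README:

```python
# Worked example of the global_batchsize formula above, using the
# TP8PP10 configuration on 10 machines x 8 R300 cards from this README.
world_size = 10 * 8           # 80 cards in total
tp, pp = 8, 10                # tensor- and pipeline-parallel sizes
lbs, gas = 1, 44              # local batch size and gradient-accumulation steps

dp = world_size // (tp * pp)  # data_parallel_size = 80 / 8 / 10 = 1
gbs = lbs * gas * dp          # global_batchsize = 1 * 44 * 1 = 44
print(dp, gbs)                # -> 1 44
```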

* General metrics

| Metric | Value | Notes |
| ------------ | -------------------------- | ---------------------------------- |
| Task category | natural language understanding | |
| Model | llama2_70b | |
| Dataset | pile wikipedia | |
| Data precision | precision, see "Performance metrics" | one of fp32/amp/fp16/bf16 |
| Hyperparameter change | parallel, see "Performance metrics" | format TPxPPyDPz, e.g. TP2PP1DP4 |
| Hyperparameter change | fix_hp, see "Performance metrics" | extra hyperparameters needed to saturate the hardware when measuring throughput |
| Hardware name | nvidia H800 | |
| Hardware memory usage | mem, see "Performance metrics" | commonly called "device memory", in GiB |
| Compute utilization | MFU, see "Performance metrics" | as defined in the PaLM paper; see the sketch below |
| **Throughput** | **token/p/s, see "Performance metrics"** | average tokens processed per card per second |
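
A short sketch of how MFU can be computed under the PaLM-paper definition referenced above, using the common "6 x parameter count" FLOPs-per-token approximation; the throughput and peak-FLOPS numbers below are placeholders, not measured R300 values:

```python
# Hypothetical MFU computation (PaLM-style): model FLOPs per token are
# approximated as 6 * parameter_count (forward + backward pass).
params = 70e9                 # llama2_70b parameter count
tokens_per_gpu_per_s = 100.0  # placeholder for the measured token/p/s metric
peak_flops = 240e12           # placeholder peak FLOPS of one accelerator

mfu = tokens_per_gpu_per_s * 6 * params / peak_flops
print(f"MFU = {mfu:.1%}")     # -> MFU = 17.5%
```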

* Performance metrics

Note that the fourth experiment group below uses the same global_batchsize as the original llama2 paper and trains for 100 steps; it also serves as the accuracy-alignment experiment.

| Config | precision | parallel | fix_hp | token/p/s | accuracy aligned | mem | MFU |
| ------------------- | --------- | --------- | ---------------------------- | --------- | ----- | ----- | --- |
| R300 10 machines x 8 cards (10x8) | fp32 | TP8PP10DP1 | / | / | / | 21/32 | / |
| R300 10 machines x 8 cards (10x8) | amp | TP8PP10DP1 | GAS=1024 (GBS=1024 = 4M tokens) | / | doing* | 21/32 | / |

Because R300 machines are in short supply, accuracy was first verified on a single R300 card against a single GPU. By reducing the number of model layers, accuracy has been verified on a single R300 versus a single GPU; accuracy verification of the full 70B model is in progress.
