Showing 41 changed files with 1,219 additions and 4 deletions.
@@ -0,0 +1,17 @@
def analysis_log(logpath):
    # Scan a monitor log for "used/total" memory entries and track the peak usage.
    max_usage = 0.0
    max_mem = 0.0
    with open(logpath) as logfile:
        for line in logfile:
            if "MiB" in line:
                usage_and_maxusage = line.split(" ")[2]
                usage = float(usage_and_maxusage.split("/")[0])
                max_usage = max(max_usage, usage)
                max_mem = float(usage_and_maxusage.split("/")[1])
                # max_mem = float(max_mem[:-3])
    print(max_mem)
    print(max_usage)
    # convert MiB to GiB before returning
    return round(max_usage / 1024.0, 2), round(max_mem / 1024.0, 2), 120e12, 240e12
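A minimal usage sketch for analysis_log follows; the sample file name and the assumed log-line layout (third whitespace-separated field holding "used/total" memory, with "MiB" elsewhere on the line) are inferred from the parsing code above rather than from any documented mx-smi output format.

```python
sample = (
    "2024-01-01-00:00:00\n"
    "ts 450W 12345/65536 MiB\n"
    "ts 460W 23456/65536 MiB\n"
)
with open("sample_monitor.log", "w") as f:   # hypothetical path
    f.write(sample)

used_gib, total_gib, _, _ = analysis_log("sample_monitor.log")
print(used_gib, total_gib)  # 22.91 64.0
```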
@@ -0,0 +1,256 @@
#!/usr/bin/env python3
# encoding: utf-8
'''
Usage: python3 sys-monitor.py -o operation -l [log_path]
       -o, --operation   start|stop|restart|status
       -l, --log         log path, ./logs/ by default
'''

import os
import sys
import time
import signal
import atexit
import argparse
import datetime
from multiprocessing import Process
import subprocess
import schedule


class Daemon:
    '''
    Daemon subprocess class.
    Usage: subclass this daemon and override the run() method.
    sys-monitor.pid: kept in /tmp/, deleted automatically on unexpected exit.
    verbose: debug mode, disabled by default.
    '''

    def __init__(self,
                 pid_file,
                 log_file,
                 err_file,
                 gpu_log,
                 log_path,
                 rate=5,
                 stdin=os.devnull,
                 stdout=os.devnull,
                 stderr=os.devnull,
                 home_dir='.',
                 umask=0o22,
                 verbose=0):
        self.stdin = stdin
        self.stdout = stdout
        self.stderr = stderr
        self.home_dir = home_dir
        self.verbose = verbose
        self.pidfile = pid_file
        self.logfile = log_file
        self.errfile = err_file
        self.gpufile = gpu_log
        self.logpath = log_path
        self.rate = rate
        self.umask = umask
        self.daemon_alive = True

    def get_pid(self):
        try:
            with open(self.pidfile, 'r') as pf:
                pid = int(pf.read().strip())
        except IOError:
            pid = None
        except SystemExit:
            pid = None
        return pid

    def del_pid(self):
        if os.path.exists(self.pidfile):
            os.remove(self.pidfile)

    def run(self):
        '''
        NOTE: override the method in subclass
        '''

        def gpu_mon(file):
            TIMESTAMP = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
            cmd = "mx-smi |grep 'W' -m 1 | awk '{print $2, $3, $5,$6}' && mx-smi |grep 'MXC' -m 1 | awk '{print $7}'"
            process = subprocess.Popen(cmd,
                                       shell=True,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.STDOUT,
                                       encoding='utf-8')
            try:
                out = process.communicate(timeout=10)
            except subprocess.TimeoutExpired:
                process.kill()
                out = process.communicate()

            # record an explicit error marker when mx-smi fails, otherwise log its output
            if process.returncode != 0:
                result = TIMESTAMP + " error\n"
            else:
                result = TIMESTAMP + "\n" + out[0] + "\n"
            with open(file, 'a') as f:
                f.write(result)

        def timer_gpu_mon():
            gpu_process = Process(target=gpu_mon, args=(self.gpufile, ))
            gpu_process.start()

        schedule.every(self.rate).seconds.do(timer_gpu_mon)
        while True:
            schedule.run_pending()
            time.sleep(5)

    def daemonize(self):
        if self.verbose >= 1:
            print('daemon process starting ...')
        try:
            pid = os.fork()
            if pid > 0:
                sys.exit(0)
        except OSError as e:
            sys.stderr.write('fork #1 failed: %d (%s)\n' %
                             (e.errno, e.strerror))
            sys.exit(1)
        os.chdir(self.home_dir)
        os.setsid()
        os.umask(self.umask)
        try:
            pid = os.fork()
            if pid > 0:
                sys.exit(0)
        except OSError as e:
            sys.stderr.write('fork #2 failed: %d (%s)\n' %
                             (e.errno, e.strerror))
            sys.exit(1)
        sys.stdout.flush()
        sys.stderr.flush()
        si = open(self.stdin, 'r')
        so = open(self.stdout, 'a+')
        if self.stderr:
            se = open(self.stderr, 'a+')
        else:
            se = so
        os.dup2(si.fileno(), sys.stdin.fileno())
        os.dup2(so.fileno(), sys.stdout.fileno())
        os.dup2(se.fileno(), sys.stderr.fileno())
        atexit.register(self.del_pid)
        pid = str(os.getpid())
        with open(self.pidfile, 'w+') as f:
            f.write('%s\n' % pid)

    def start(self):
        if not os.path.exists(self.logpath):
            os.makedirs(self.logpath)
        elif os.path.exists(self.gpufile):
            os.remove(self.gpufile)
        if self.verbose >= 1:
            print('ready to start ......')
        # check for a pid file to see if the daemon already runs
        pid = self.get_pid()
        if pid:
            msg = 'pid file %s already exists, is it already running?\n'
            sys.stderr.write(msg % self.pidfile)
            sys.exit(1)
        # start the daemon
        self.daemonize()
        self.run()

    def stop(self):
        if self.verbose >= 1:
            print('stopping ...')
        pid = self.get_pid()
        if not pid:
            msg = 'pid file [%s] does not exist. Not running?\n' % self.pidfile
            sys.stderr.write(msg)
            if os.path.exists(self.pidfile):
                os.remove(self.pidfile)
            return
        # try to kill the daemon process
        try:
            i = 0
            while 1:
                os.kill(pid, signal.SIGTERM)
                time.sleep(1)
                i = i + 1
                if i % 10 == 0:
                    os.kill(pid, signal.SIGHUP)
        except OSError as err:
            err = str(err)
            if err.find('No such process') > 0:
                if os.path.exists(self.pidfile):
                    os.remove(self.pidfile)
            else:
                print(str(err))
                sys.exit(1)
            if self.verbose >= 1:
                print('Stopped!')

    def restart(self):
        self.stop()
        self.start()

    def status(self):
        pid = self.get_pid()
        if pid:
            if os.path.exists('/proc/%d' % pid):
                return pid
        return False


def parse_args():
    ''' Check script input parameter. '''
    parse = argparse.ArgumentParser(description='Sys monitor script')
    parse.add_argument('-o',
                       type=str,
                       metavar='[operation]',
                       required=True,
                       help='start|stop|restart|status')
    parse.add_argument('-l',
                       type=str,
                       metavar='[log_path]',
                       required=False,
                       default='./logs/',
                       help='log path')
    args = parse.parse_args()
    return args


def main():
    sample_rate1 = 5
    args = parse_args()
    operation = args.o
    log_path = args.l
    pid_fn = '/tmp/gpu_monitor.pid'
    log_fn = log_path + '/metax_monitor.log'
    err_fn = log_path + '/metax_monitor.err'
    # result for gpu
    gpu_fn = log_path + '/metax_monitor.log'

    subdaemon = Daemon(pid_fn,
                       log_fn,
                       err_fn,
                       gpu_fn,
                       log_path,
                       verbose=1,
                       rate=sample_rate1)
    if operation == 'start':
        subdaemon.start()
    elif operation == 'stop':
        subdaemon.stop()
    elif operation == 'restart':
        subdaemon.restart()
    elif operation == 'status':
        pid = subdaemon.status()
        if pid:
            print('process [%s] is running ......' % pid)
        else:
            print('daemon process [%s] stopped' % pid)
    else:
        print("invalid argument!")
        sys.exit(1)


if __name__ == '__main__':
    main()
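For reference, a minimal sketch of driving this monitor from a Python harness; the script location, log directory, and the run_training.sh workload are placeholders, and FlagPerf's actual launcher may start and stop the daemon differently.

```python
import subprocess

# start the monitoring daemon, run the workload, then stop the daemon
subprocess.run(["python3", "sys-monitor.py", "-o", "start", "-l", "./logs/"], check=True)
try:
    subprocess.run(["bash", "run_training.sh"], check=True)  # hypothetical workload
finally:
    subprocess.run(["python3", "sys-monitor.py", "-o", "stop", "-l", "./logs/"], check=True)
```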
@@ -0,0 +1,11 @@
FROM mxcr.io/library/maca-c500-pytorch:2.19.2.5-ubuntu18.04-amd64
ENV PATH="/opt/conda/bin:${PATH}"
ENV PYTORCH_USE_FLASHATTN=1
RUN pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
RUN pip install numpy
RUN pip install pyyaml
RUN pip install timm
RUN pip install munch
RUN pip install schedule
RUN pip install loguru
RUN /bin/bash -c "uname -a"
@@ -0,0 +1,6 @@
FROM iregistry.baidu-int.com/xmlir/xmlir_ubuntu_2004_x86_64:v0.27
RUN /bin/bash -c "pip config set global.index-url https://mirror.baidu.com/pypi/simple"
RUN /bin/bash -c "uname -a"
RUN /bin/bash -c "alias python3=python"

ENV PATH /root/miniconda/envs/python38_torch201_cuda/bin:$PATH
training/kunlunxin/docker_image/megatron/megatron_install.sh (14 additions, 0 deletions)
@@ -0,0 +1,14 @@
#!/bin/bash
# using github mirrors to avoid github TTL
#export https_proxy=http://10.1.0.34:7890
git clone https://githubfast.com/FlagOpen/FlagScale
cd FlagScale

git checkout eb0438a5459404e2e4c70b15fa37e9a197ab159d
echo 'export PYTHONPATH=$PYTHONPATH:/home/FlagScale' >> /root/.bashrc
source /root/.bashrc

wget https://bd.bcebos.com/v1/klx-pytorch-work-bd/training/zhangling21_llama70B/xmlir201_5.run
bash xmlir201_5.run
XFLAGS --enable transformer_engine
XFLAGS --enable flagscale
@@ -0,0 +1,49 @@
### Kunlunxin XPU Configuration and Run Information Reference
#### Environment Configuration
- ##### Hardware Environment
  - Machine model: Kunlunxin AI accelerator group R480-X8
  - Accelerator card model: Kunlunxin AI accelerator card R300
  - Multi-node network type and bandwidth: InfiniBand, 200 Gb/s

- ##### Software Environment
  - OS version: Ubuntu 20.04
  - OS kernel version: 5.4.0-26-generic
  - Accelerator driver version: 4.0.25
  - Docker image and version: iregistry.baidu-int.com/xmlir/xmlir_ubuntu_2004_x86_64:v0.27
  - Training framework version: xmlir
  - Training compiler version: xacc
  - Dependency version: pytorch-2.0.1


### Run Information

* Input batch sizes
  1. local_batchsize (micro_batchsize), abbreviated LBS: the tensor batch size that actually enters the model, as set in config_H100x4x8.py; it defaults to 1 in this case.
  2. seqlength (max_position_embedding), abbreviated MPE: the sequence length that actually enters the model, as set in config_H100x4x8.py; it defaults to 4096 in this case.
  3. gradient_accumulate_steps, abbreviated GAS: the number of gradient-accumulation steps, as set in ds_config.json; it defaults to 44 in this case.
  4. global_batchsize is always local_batchsize \* gradient_accumulate_steps \* data_parallel_size, where data_parallel_size = world_size / TPsize / PPsize in this case. A worked example is sketched after this list.

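As a cross-check of the formula above, here is a small sketch that plugs in the numbers quoted in this case (LBS=1, MPE=4096, a 10x8 R300 setup, TP=8, PP=10, and the GAS=1024 setting from the second row of the performance table below); the mapping of those values onto the formula is my reading of this README, not an additional measurement.

```python
# global_batchsize = LBS * GAS * DP, with DP = world_size / TP / PP
world_size, tp, pp = 80, 8, 10     # ten R300 nodes with eight cards each
lbs, gas, mpe = 1, 1024, 4096      # values quoted in this case

dp = world_size // tp // pp        # 80 / 8 / 10 = 1
gbs = lbs * gas * dp               # 1 * 1024 * 1 = 1024
tokens_per_step = gbs * mpe        # 1024 * 4096 = 4,194,304, i.e. about 4M tokens
print(dp, gbs, tokens_per_step)
```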
* General metrics

| Metric | Value | Notes |
| ------------ | -------------------------- | ---------------------------------- |
| Task category | natural language understanding | |
| Model | llama2_70b | |
| Dataset | pile wikipedia | |
| Data precision | precision, see "Performance metrics" | one of fp32/amp/fp16/bf16 |
| Hyperparameter change | parallel, see "Performance metrics" | format TPxPPyDPz, e.g. TP2PP1DP4 |
| Hyperparameter change | fix_hp, see "Performance metrics" | special hyperparameters needed to saturate the hardware when measuring throughput |
| Hardware device | nvidia H800 | |
| Hardware memory usage | mem, see "Performance metrics" | commonly called "device memory", in GiB |
| Compute utilization | MFU, see "Performance metrics" | as defined in the PaLM paper |
| **Throughput** | **token/p/s, see "Performance metrics"** | average number of tokens processed per card per second |

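Since MFU is only referenced here by pointing at the PaLM paper, a minimal sketch of the usual computation is given below; the 6 * n_params FLOPs-per-token approximation (which ignores the attention term) and every numeric value in it are illustrative assumptions, not measurements from this case.

```python
# PaLM-style MFU: observed throughput divided by the theoretical peak throughput
n_params = 70e9              # llama2_70b parameter count
tokens_per_gpu_per_s = 100   # hypothetical token/p/s value
peak_flops_per_gpu = 120e12  # hypothetical per-card peak FLOPS

model_flops_per_token = 6 * n_params            # forward + backward approximation
mfu = tokens_per_gpu_per_s * model_flops_per_token / peak_flops_per_gpu
print(f"MFU = {mfu:.2%}")    # 0.35 -> "MFU = 35.00%"
```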
* Performance metrics

Note that the fourth experiment group below uses the same global_batchsize as the original llama2 paper and trains for 100 steps; that experiment also serves as the accuracy-alignment run.

| Config | precision | parallel | fix_hp | token/p/s | accuracy aligned | mem | MFU |
| ------------------- | --------- | --------- | ---------------------------- | --------- | ----- | ----- | --- |
| R300, 10 nodes x 8 cards (10x8) | fp32 | TP8PP10DP1 | / | / | / | 21/32 | / |
| R300, 10 nodes x 8 cards (10x8) | amp | TP8PP10DP1 | GAS=1024 (GBS=1024=4M tokens) | / | doing* | 21/32 | / |

Because R300 machines are scarce, accuracy was first verified on a single R300 card against a single GPU. Accuracy has been verified on a single R300 and a single GPU with a reduced number of model layers; accuracy verification for the full 70B model is in progress.