add fp32 training performance data and update readme
jamesruio committed Jan 11, 2024
1 parent 3f93915 commit 0171aca
Showing 7 changed files with 26 additions and 199 deletions.
16 changes: 14 additions & 2 deletions training/benchmarks/llama2_70B/megatron/megatron_main.sh
@@ -14,7 +14,8 @@ M_BATCHSIZE=${10}
G_BATCHSIZE=${11}
SEQLENGTH=${12}
FLASH_ATTN=${13}
VENDOR_SHELL=${14}
RECOMPUTE=${14}
VENDOR_SHELL=${15}

echo $DATA_DIR
echo $GPUS_PER_NODE
@@ -29,6 +30,7 @@ echo $M_BATCHSIZE
echo $G_BATCHSIZE
echo $SEQLENGTH
echo $FLASH_ATTN
echo $RECOMPUTE
echo $VENDOR_SHELL

DATA_PATH=$DATA_DIR/llama_00_text_document
@@ -96,6 +98,11 @@ NETWORK_ARGS="
--untie-embeddings-and-output-weights
"

if [ "$RECOMPUTE" = "True" ]; then
RECOMPUTE_ARGS="
--recompute-activations
"

INITIALIZATION_ARGS="
--init-method-std 0.02 \
--seed 1234
@@ -117,8 +124,12 @@ LEARNING_RATE_ARGS="
--lr-warmup-fraction .01
"

LOGGING_ARGS="
--log-interval 1
"

source $VENDOR_SHELL
cmd="torchrun $DISTRIBUTED_ARGS pretrain_llama.py \
cmd="torchrun $DISTRIBUTED_ARGS /workspace/FlagScale/pretrain_llama.py \
$TRAINING_ARGS \
$MIXED_PRECISION_ARGS \
$DATA_ARGS \
@@ -127,6 +138,7 @@ cmd="torchrun $DISTRIBUTED_ARGS pretrain_llama.py \
$REGULARIZATION_ARGS \
$LEARNING_RATE_ARGS \
$CHECKPOINTING_ARGS \
$RECOMPUTE_ARGS \
$LOGGING_ARGS
"
echo $cmd
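On the launcher side, the recompute flag becomes one more positional argument, shifting the vendor shell to position 15. A minimal sketch of the ordering, assuming the launcher invokes the script as `bash megatron_main.sh ...` (the leading arguments and the adapter path are placeholders, not values from the repo):

```python
# Hypothetical sketch of the argument tail megatron_main.sh now expects;
# positions mirror the variable assignments in the hunk above.
leading_args = "..."                      # DATA_DIR through SEQLENGTH (${1}..${12}), elided here
tail_args = [
    "True",                               # ${13} FLASH_ATTN
    "False",                              # ${14} RECOMPUTE (new in this commit)
    "/path/to/training_adapter.sh",       # ${15} VENDOR_SHELL (previously ${14})
]
exec_cmd = "bash megatron_main.sh " + leading_args + " " + " ".join(tail_args)
```

When RECOMPUTE is "True", the script assembles `RECOMPUTE_ARGS="--recompute-activations"`, trading extra forward compute for lower activation memory; when it is "False", RECOMPUTE_ARGS stays empty and the torchrun command is unchanged.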
189 changes: 0 additions & 189 deletions training/benchmarks/llama2_70B/megatron/pretrain_llama.py

This file was deleted.

4 changes: 3 additions & 1 deletion training/benchmarks/llama2_70B/megatron/run_pretraining.py
@@ -42,6 +42,7 @@ def parse_args():
theoryflops = getattr(module, 'theoryflops')
epochs = getattr(module, 'epochs')
flashattn = getattr(module, 'flashattn')
recompute = getattr(module, 'recompute')
tensor_parallel = getattr(module, 'tensor_parallel')
pipeline_parallel = getattr(module, 'pipeline_parallel')

@@ -66,6 +67,7 @@
exec_cmd = exec_cmd + " " + str(gbs)
exec_cmd = exec_cmd + " " + str(seqlength)
exec_cmd = exec_cmd + " " + str(flashattn)
exec_cmd = exec_cmd + " " + str(recompute)
exec_cmd = exec_cmd + " " + os.path.join(config_dir_path, "training_adapter.sh")

with open(task_log_file, "w") as f:
@@ -87,4 +89,4 @@ def parse_args():
chip_tps = whole_tps / (args.nproc_per_node * args.nnodes)
print("System tokens per second: ", whole_tps)
print("Tokens/p/s: ", chip_tps)
print("MFU: ", chip_tps * 7000000000.0 * 6 / theoryflops)
print("MFU: ", chip_tps * 70000000000.0 * 6 / theoryflops)
2 changes: 1 addition & 1 deletion training/nvidia/docker_image/megatron/megatron_install.sh
@@ -1,5 +1,5 @@
#!/bin/bash
# using github mirrors to avoid github TTL
git clone -b kunlunxin_llama70B https://github.com/jamesruio/FlagScale.git
git clone https://githubfast.com/FlagOpen/FlagScale
echo 'export PYTHONPATH=$PYTHONPATH:/workspace/FlagScale' >> /root/.bashrc
source /root/.bashrc
11 changes: 6 additions & 5 deletions training/nvidia/llama2_70B-megatron/README.md
@@ -40,7 +40,7 @@
| Task category | Natural language understanding | |
| Model | llama2_70b | |
| Dataset | pile wikipedia | |
| Data precision | amp | |
| Data precision | precision, see "Performance metrics" | one of fp32/amp/fp16/bf16 |
| Hyperparameter change | parallel, see "Performance metrics" | format TPxPPyDPz, e.g. TP2PP1DP4 |
| Hyperparameter change | fix_hp, see "Performance metrics" | special hyperparameters needed to saturate the hardware for throughput evaluation |
| Hardware device name | nvidia H800 | |
@@ -50,7 +50,8 @@

* Performance metrics

| Config | parallel | fix_hp | token/p/s | loss | mem | MFU |
| ------------------- | ------ | ---------------- | ------ | ------- | --------- | --------- |
| H800 4 nodes x 8 GPUs (4x8) | TP8PP4DP1 | / | 641.93 | 5.7 | 62/80 | 27.2% |
| H800 4 nodes x 8 GPUs (4x8) | TP4PP8DP1 | / | 791.37 | 5.6 | 74/80 | 33.6% |
| Config | precision | parallel | fix_hp | token/p/s | loss | mem | MFU |
| ------------------ | -------- | --------- | ---------------- | ------ | ------- | --------- | --------- |
| H800 4 nodes x 8 GPUs (4x8) | fp32 | TP8PP4DP1 | recompute=True | 253.61 | 0.94 | 77/80 | 10.7% |
| H800 4 nodes x 8 GPUs (4x8) | amp | TP8PP4DP1 | / | 641.93 | 5.7 | 62/80 | 27.2% |
| H800 4 nodes x 8 GPUs (4x8) | amp | TP4PP8DP1 | / | 791.37 | 5.6 | 74/80 | 33.6% |
@@ -5,5 +5,6 @@
theoryflops = 989000000000000.0
epochs = 1
flashattn = True
recompute = False
tensor_parallel = 8
pipeline_parallel = 4
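Taken together with the hunk above, the vendor config after this commit would read roughly as follows; only the fields visible in the hunk are certain, and the inline comments are annotations rather than repo content:

```python
# Vendor config for the nvidia llama2_70B-megatron case (fields from the hunk above).
theoryflops = 989000000000000.0   # per-chip peak FLOPS, the denominator of the MFU calculation
epochs = 1
flashattn = True                  # forwarded to megatron_main.sh as FLASH_ATTN (${13})
recompute = False                 # new flag; True adds --recompute-activations via RECOMPUTE (${14})
tensor_parallel = 8               # TP8
pipeline_parallel = 4             # PP4 -> TP8PP4DP1 across 32 GPUs
```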
2 changes: 1 addition & 1 deletion training/run_benchmarks/config/cluster_conf.py
@@ -1,7 +1,7 @@
'''Cluster configs'''

# Hosts to run the benchmark. Each item is an IP address or a hostname.
HOSTS = ["192.2.32.13", "192.2.32.14", "192.2.32.2", "192.2.32.4"]
HOSTS = ["10.1.2.2", "10.1.2.3", "10.1.2.4"]

# Hosts port to run the tensorflow distribution_strategy = 'multi_worker_mirrored'
HOSTS_PORTS = ["2222"]
