add fp32 training performance data and update readme
jamesruio committed Jan 11, 2024
1 parent 3f93915 commit 0171aca
Showing 7 changed files with 26 additions and 199 deletions.
16 changes: 14 additions & 2 deletions training/benchmarks/llama2_70B/megatron/megatron_main.sh
@@ -14,7 +14,8 @@ M_BATCHSIZE=${10}
G_BATCHSIZE=${11}
SEQLENGTH=${12}
FLASH_ATTN=${13}
VENDOR_SHELL=${14}
RECOMPUTE=${14}
VENDOR_SHELL=${15}

echo $DATA_DIR
echo $GPUS_PER_NODE
@@ -29,6 +30,7 @@ echo $M_BATCHSIZE
echo $G_BATCHSIZE
echo $SEQLENGTH
echo $FLASH_ATTN
echo $RECOMPUTE
echo $VENDOR_SHELL

DATA_PATH=$DATA_DIR/llama_00_text_document
@@ -96,6 +98,11 @@ NETWORK_ARGS="
--untie-embeddings-and-output-weights
"

if [ "$RECOMPUTE" = "True" ]; then
RECOMPUTE_ARGS="
--recompute-activations
"

INITIALIZATION_ARGS="
--init-method-std 0.02 \
--seed 1234
@@ -117,8 +124,12 @@ LEARNING_RATE_ARGS="
--lr-warmup-fraction .01
"

LOGGING_ARGS="
--log-interval 1
"

source $VENDOR_SHELL
cmd="torchrun $DISTRIBUTED_ARGS pretrain_llama.py \
cmd="torchrun $DISTRIBUTED_ARGS /workspace/FlagScale/pretrain_llama.py \
$TRAINING_ARGS \
$MIXED_PRECISION_ARGS \
$DATA_ARGS \
@@ -127,6 +138,7 @@ cmd="torchrun $DISTRIBUTED_ARGS pretrain_llama.py \
$REGULARIZATION_ARGS \
$LEARNING_RATE_ARGS \
$CHECKPOINTING_ARGS \
$RECOMPUTE_ARGS \
$LOGGING_ARGS
"
echo $cmd
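On the launcher side, the recompute flag becomes one more positional argument, shifting the vendor shell to position 15. A minimal sketch of the ordering, assuming the launcher invokes the script as `bash megatron_main.sh ...` (the leading arguments and the adapter path are placeholders, not values from the repo):

```python
# Hypothetical sketch of the argument tail megatron_main.sh now expects;
# positions mirror the variable assignments in the hunk above.
leading_args = "..."                      # DATA_DIR through SEQLENGTH (${1}..${12}), elided here
tail_args = [
    "True",                               # ${13} FLASH_ATTN
    "False",                              # ${14} RECOMPUTE (new in this commit)
    "/path/to/training_adapter.sh",       # ${15} VENDOR_SHELL (previously ${14})
]
exec_cmd = "bash megatron_main.sh " + leading_args + " " + " ".join(tail_args)
```

When RECOMPUTE is "True", the script assembles `RECOMPUTE_ARGS="--recompute-activations"`, trading extra forward compute for lower activation memory; when it is "False", RECOMPUTE_ARGS stays empty and the torchrun command is unchanged.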
189 changes: 0 additions & 189 deletions training/benchmarks/llama2_70B/megatron/pretrain_llama.py

This file was deleted.

4 changes: 3 additions & 1 deletion training/benchmarks/llama2_70B/megatron/run_pretraining.py
@@ -42,6 +42,7 @@ def parse_args():
theoryflops = getattr(module, 'theoryflops')
epochs = getattr(module, 'epochs')
flashattn = getattr(module, 'flashattn')
recompute = getattr(module, 'recompute')
tensor_parallel = getattr(module, 'tensor_parallel')
pipeline_parallel = getattr(module, 'pipeline_parallel')

@@ -66,6 +67,7 @@
exec_cmd = exec_cmd + " " + str(gbs)
exec_cmd = exec_cmd + " " + str(seqlength)
exec_cmd = exec_cmd + " " + str(flashattn)
exec_cmd = exec_cmd + " " + str(recompute)
exec_cmd = exec_cmd + " " + os.path.join(config_dir_path, "training_adapter.sh")

with open(task_log_file, "w") as f:
@@ -87,4 +89,4 @@ def parse_args():
chip_tps = whole_tps / (args.nproc_per_node * args.nnodes)
print("System tokens per second: ", whole_tps)
print("Tokens/p/s: ", chip_tps)
print("MFU: ", chip_tps * 7000000000.0 * 6 / theoryflops)
print("MFU: ", chip_tps * 70000000000.0 * 6 / theoryflops)
2 changes: 1 addition & 1 deletion training/nvidia/docker_image/megatron/megatron_install.sh
@@ -1,5 +1,5 @@
#!/bin/bash
# using github mirrors to avoid github TTL
git clone -b kunlunxin_llama70B https://github.com/jamesruio/FlagScale.git
git clone https://githubfast.com/FlagOpen/FlagScale
echo 'export PYTHONPATH=$PYTHONPATH:/workspace/FlagScale' >> /root/.bashrc
source /root/.bashrc
11 changes: 6 additions & 5 deletions training/nvidia/llama2_70B-megatron/README.md
@@ -40,7 +40,7 @@
| Task category | Natural language understanding | |
| Model | llama2_70b | |
| Dataset | pile wikipedia | |
| Data precision | amp | |
| Data precision | precision, see "Performance metrics" | one of fp32/amp/fp16/bf16 |
| Hyperparameter change | parallel, see "Performance metrics" | format TPxPPyDPz, e.g. TP2PP1DP4 |
| Hyperparameter change | fix_hp, see "Performance metrics" | special hyperparameters needed to saturate the hardware for throughput evaluation |
| Hardware device name | nvidia H800 | |
@@ -50,7 +50,8 @@

* Performance metrics

| Config | parallel | fix_hp | token/p/s | loss | mem | MFU |
| ------------------- | ------ | ---------------- | ------ | ------- | --------- | --------- |
| H800 4 nodes x 8 GPUs (4x8) | TP8PP4DP1 | / | 641.93 | 5.7 | 62/80 | 27.2% |
| H800 4 nodes x 8 GPUs (4x8) | TP4PP8DP1 | / | 791.37 | 5.6 | 74/80 | 33.6% |
| Config | precision | parallel | fix_hp | token/p/s | loss | mem | MFU |
| ------------------ | -------- | --------- | ---------------- | ------ | ------- | --------- | --------- |
| H800 4 nodes x 8 GPUs (4x8) | fp32 | TP8PP4DP1 | recompute=True | 253.61 | 0.94 | 77/80 | 10.7% |
| H800 4 nodes x 8 GPUs (4x8) | amp | TP8PP4DP1 | / | 641.93 | 5.7 | 62/80 | 27.2% |
| H800 4 nodes x 8 GPUs (4x8) | amp | TP4PP8DP1 | / | 791.37 | 5.6 | 74/80 | 33.6% |
@@ -5,5 +5,6 @@
theoryflops = 989000000000000.0
epochs = 1
flashattn = True
recompute = False
tensor_parallel = 8
pipeline_parallel = 4
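Taken together with the hunk above, the vendor config after this commit would read roughly as follows; only the fields visible in the hunk are certain, and the inline comments are annotations rather than repo content:

```python
# Vendor config for the nvidia llama2_70B-megatron case (fields from the hunk above).
theoryflops = 989000000000000.0   # per-chip peak FLOPS, the denominator of the MFU calculation
epochs = 1
flashattn = True                  # forwarded to megatron_main.sh as FLASH_ATTN (${13})
recompute = False                 # new flag; True adds --recompute-activations via RECOMPUTE (${14})
tensor_parallel = 8               # TP8
pipeline_parallel = 4             # PP4 -> TP8PP4DP1 across 32 GPUs
```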
2 changes: 1 addition & 1 deletion training/run_benchmarks/config/cluster_conf.py
@@ -1,7 +1,7 @@
'''Cluster configs'''

# Hosts to run the benchmark. Each item is an IP address or a hostname.
HOSTS = ["192.2.32.13", "192.2.32.14", "192.2.32.2", "192.2.32.4"]
HOSTS = ["10.1.2.2", "10.1.2.3", "10.1.2.4"]

# Hosts port to run the tensorflow distribution_strategy = 'multi_worker_mirrored'
HOSTS_PORTS = ["2222"]
