forked from FlagOpen/FlagPerf
[metax] add glm result (FlagOpen#466)
* add bert_hf result
* Update README.md 1
* add glm result
* [metax] Update glm README.md
1 parent fb689ab, commit b9f6294
Showing 12 changed files with 635 additions and 1 deletion.
```diff
@@ -1,4 +1,4 @@
-### Nvidia GPU configuration and runtime information reference
+### MetaX C500 GPU configuration and runtime information reference
 #### Environment configuration
 - ##### Hardware environment
   - Machine and accelerator model: 曦云®C500 64G
```
```diff
@@ -0,0 +1,46 @@
+### Model checkpoint download
+[Model checkpoint download](../../benchmarks/glm/README.md#模型checkpoint)
+### Test dataset download
+[Test dataset download](../../benchmarks/glm/README.md#数据集)
+
+### MetaX C500 GPU configuration and runtime information reference
+#### Environment configuration
+- ##### Hardware environment
+  - Machine and accelerator model: 曦云®C500 64G
+  - Multi-node network type and bandwidth: InfiniBand, 2x200 Gb/s
+- ##### Software environment
+  - OS version: Ubuntu 20.04.6
+  - OS kernel version: 5.4.0-26-generic
+  - Accelerator driver version: 2.2.0
+  - Docker version: 24.0.7
+  - Training framework version: pytorch-2.0.0+mc2.18.0.8-cp38-cp38-linux_x86_64.whl
+  - Dependency software versions: none
+
+### Run results
+* General metrics
+
+| Metric | Value | Notes |
+| ---------------- | -------------------------------------------- | ------------------------------------------- |
+| Task category | natural language understanding, unconditional text generation, conditional text generation | |
+| Model | GLM | |
+| Dataset | superglue | |
+| Data precision | precision, see "Performance metrics" | one of fp32/amp/fp16 |
+| Hyperparameter changes | fix_hp, see "Performance metrics" | special hyperparameters needed to saturate the hardware for throughput evaluation |
+| Hardware device short name | MXC500 | |
+| Memory usage | mem (actual/total), see "Performance metrics" | commonly called "VRAM", in GiB |
+| End-to-end time | e2e_time, see "Performance metrics" | total time plus Perf initialization, etc. |
+| Overall throughput | p_whole, see "Performance metrics" | actual training samples divided by total time (performance_whole) |
+| Training throughput | p_train, see "Performance metrics" | excludes per-epoch evaluation time |
+| **Compute throughput** | **p_core, see "Performance metrics"** | excludes data I/O time (p3>p2>p1) |
+| **Accelerator utilization** | **\*MFU** | model flops utilization |
+| Training result | acc, see "Performance metrics" | accuracy |
+| Extra modifications | none | |
+
+* Performance metrics
+
+| Configuration | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem | MFU |
+| ------------------- | --------- | --------------- | -------- | ------- | ------- | ------ | ----- | --------- | ----- |
+| C500 1 node, 8 cards (1x8) | fp32 | / | | | | | 0.802 | 54.5/64.0 | |
+| C500 1 node, 1 card (1x1) | fp32 | / | | | | | / | 50.4/64.0 | |
+| C500 2 nodes, 16 cards (2x8) | fp32 | / | | | | | / | 29.8/64.0 | |
```
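The MFU column in the table above is left to be filled from measurements; it follows the standard definition, achieved model flops divided by theoretical peak flops. A minimal sketch of that computation (the function name and the illustrative numbers are assumptions, not C500 measurements):

```python
def mfu(model_flops_per_sample: float, samples_per_sec: float,
        num_devices: int, peak_flops_per_device: float) -> float:
    """Model flops utilization: achieved flops / theoretical peak flops."""
    achieved = model_flops_per_sample * samples_per_sec
    return achieved / (num_devices * peak_flops_per_device)

# Illustrative numbers only: 1e12 flops per sample at 100 samples/s
# across 8 devices, each with an assumed 100 TFLOPS peak.
ratio = mfu(1e12, 100.0, 8, 100e12)  # -> 0.125
```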
```diff
@@ -0,0 +1,19 @@
+train_batch_size = 16
+eval_batch_size = 16
+
+max_samples_termination = 24135
+
+dist_backend = "nccl"
+
+lr = 1e-5
+weight_decay = 0.1
+adam_beta1 = 0.9
+adam_beta2 = 0.999
+adam_eps = 1e-08
+gradient_accumulation_steps = 1
+warmup = 0.1
+lr_decay_ratio = 0.1
+lr_decay_iters = 4338
+log_freq = 1
+
+training_event = None
```
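The schedule hyperparameters above (warmup, lr_decay_ratio, lr_decay_iters) compose a warmup-then-decay learning-rate curve. A sketch under one plausible reading — linear warmup over the first `warmup` fraction of `lr_decay_iters`, then linear decay toward `lr * lr_decay_ratio` — which may differ in detail from GLM's actual scheduler:

```python
def lr_at(step, base_lr=1e-5, warmup=0.1, decay_iters=4338, decay_ratio=0.1):
    # Linear warmup over the first `warmup` fraction of decay_iters,
    # then linear decay from base_lr down to base_lr * decay_ratio.
    warmup_steps = int(warmup * decay_iters)
    if step < warmup_steps:
        return base_lr * (step + 1) / warmup_steps
    frac = min(1.0, (step - warmup_steps) / max(1, decay_iters - warmup_steps))
    return base_lr * (1.0 - (1.0 - decay_ratio) * frac)
```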
```diff
@@ -0,0 +1,18 @@
+train_batch_size = 16
+eval_batch_size = 16
+
+dist_backend = "nccl"
+
+lr = 1e-5
+weight_decay = 0.1
+adam_beta1 = 0.9
+adam_beta2 = 0.999
+adam_eps = 1e-08
+gradient_accumulation_steps = 1
+warmup = 0.1
+lr_decay_ratio = 0.1
+lr_decay_iters = 4338
+log_freq = 1
+seed = 10483
+max_samples_termination = 5553080
+training_event = None
```
```diff
@@ -0,0 +1,22 @@
+fp16 = True
+ddp_type = "apex"
+train_batch_size = 8
+eval_batch_size = 8
+
+dist_backend = "nccl"
+
+lr = 1e-5
+weight_decay = 0.1
+adam_beta1 = 0.9
+adam_beta2 = 0.999
+adam_eps = 1e-08
+gradient_accumulation_steps = 1
+warmup = 0.1
+lr_decay_ratio = 0.1
+lr_decay_iters = 4338
+log_freq = 1
+
+training_event = None
+
+max_samples_termination = 1388270 * 4
+target_accuracy = 0.8
```
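`max_samples_termination` is expressed in samples, so the number of optimizer steps it implies depends on the global batch size. A quick sketch of the arithmetic (`world_size = 8` is an assumption matching the 1x8 configuration, not something the config states):

```python
train_batch_size = 8
gradient_accumulation_steps = 1
world_size = 8                        # assumed: one node with 8 cards (1x8)
max_samples_termination = 1388270 * 4  # = 5553080 samples

# Samples consumed per optimizer step across all workers.
global_batch = train_batch_size * gradient_accumulation_steps * world_size
max_steps = max_samples_termination // global_batch  # -> 86766
```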
```diff
@@ -0,0 +1,3 @@
+h5sparse
+boto3
+h5py
```
```diff
@@ -0,0 +1,21 @@
+from driver import dist_pytorch
+from .layers.transformer import GLMTransformer
+
+
+def convert_model(model, config):
+    if dist_pytorch.get_rank() == 0:
+        print("use apex layer norm", flush=True)
+    state_dict = model.state_dict()
+    transformer_layer = GLMTransformer(
+        num_layers=config.num_layers,
+        hidden_size=config.hidden_size,
+        num_attention_heads=config.num_attention_heads,
+        max_sequence_length=config.max_seq_length,
+        max_memory_length=config.max_memory_length,
+        embedding_dropout_prob=config.hidden_dropout,
+        attention_dropout_prob=config.attention_dropout,
+        output_dropout_prob=config.hidden_dropout,
+        checkpoint_activations=config.checkpoint_activations)
+    model.model.transformer = transformer_layer
+    model.load_state_dict(state_dict, strict=True)
+    return model
```
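`convert_model` follows a capture/swap/restore pattern: snapshot the weights, replace the transformer submodule with the apex-layer-norm variant, then reload the snapshot with `strict=True` so any parameter mismatch fails loudly. A torch-free sketch of the same pattern (the `Module` class is an illustrative stand-in, not FlagPerf code):

```python
class Module:
    """Minimal stand-in for a module with state_dict save/load."""
    def __init__(self, **params):
        self.params = dict(params)

    def state_dict(self):
        return dict(self.params)

    def load_state_dict(self, state, strict=True):
        # strict=True mirrors convert_model: reject any key mismatch.
        if strict and set(state) != set(self.params):
            raise KeyError("state_dict keys do not match module parameters")
        self.params.update(state)

# Capture weights, swap in a new implementation with the same parameter
# layout, then restore the captured weights into it.
model = Module(w=1.0, b=0.5)
saved = model.state_dict()
replacement = Module(w=0.0, b=0.0)   # fresh module, same parameter names
replacement.load_state_dict(saved, strict=True)
model = replacement
```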
```diff
@@ -0,0 +1 @@
+from .transformer import *
```
```diff
@@ -0,0 +1 @@
+from apex.normalization import FusedLayerNorm as LayerNorm
```
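apex's `FusedLayerNorm` computes the same math as ordinary layer normalization, just in a fused kernel. For reference, the normalization itself in pure Python (illustration only; real code would use apex or `torch.nn.LayerNorm`):

```python
import math

def layer_norm(xs, gamma=1.0, beta=0.0, eps=1e-5):
    """Reference layer norm: zero mean, unit variance, then scale and shift."""
    mean = sum(xs) / len(xs)
    var = sum((x - mean) ** 2 for x in xs) / len(xs)
    return [gamma * (x - mean) / math.sqrt(var + eps) + beta for x in xs]

ys = layer_norm([1.0, 2.0, 3.0])
```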