kunlunxin-cpm supports fp16 #1

Open · wants to merge 4 commits into main
4 changes: 2 additions & 2 deletions training/benchmarks/cpm/pytorch/model/fp16/fp16.py
@@ -201,7 +201,7 @@ def __init__(self,
             fp32_from_fp16_params_this_group = []
             for i, param in enumerate(param_group['params']):
                 if param.requires_grad:
-                    if param.type() == 'torch.cuda.HalfTensor':
+                    if param.type() == 'torch.cuda.HalfTensor' or param.type() == 'torch.xpu.HalfTensor':
                         self.maybe_print(
                             "FP16_Optimizer received torch.cuda.HalfTensor with {}"
                             .format(param.size()))
@@ -217,7 +217,7 @@
                         if param in self.optimizer.state:
                             self.optimizer.state[
                                 master_param] = self.optimizer.state.pop(param)
-                    elif param.type() == 'torch.cuda.FloatTensor':
+                    elif param.type() == 'torch.cuda.FloatTensor' or param.type() == 'torch.xpu.FloatTensor':
                         self.maybe_print(
                             "FP16_Optimizer received torch.cuda.FloatTensor with {}"
                             .format(param.size()))
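Both hunks extend a string comparison on `param.type()`, which has to enumerate every backend explicitly (and the log message still reads `torch.cuda.HalfTensor` on XPU). A device-agnostic alternative, sketched here as an illustration rather than as part of this PR, branches on `param.dtype` instead:

```python
import torch
from torch import nn

def classify_param(param: nn.Parameter) -> str:
    """Classify a parameter by dtype alone, so cuda/xpu/cpu all behave the same."""
    if param.dtype == torch.float16:
        return "half"
    if param.dtype == torch.float32:
        return "float"
    raise TypeError(f"Wrapped parameters must be fp16 or fp32, got {param.type()}")

# Example: a plain fp32 Linear layer's weight classifies as "float".
layer = nn.Linear(4, 4)
print(classify_param(layer.weight))  # float
```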
43 changes: 30 additions & 13 deletions training/kunlunxin/cpm-pytorch/README.md
@@ -1,9 +1,12 @@
 ### Model Checkpoint Download
-[Model Checkpoint Download](../../benchmarks/cpm/README.md#模型checkpoint)
+See [Model Checkpoint Download](../../benchmarks/cpm/README.md#模型checkpoint)


 ### Test Dataset Download
-[Test Dataset Download](../../benchmarks/cpm/README.md#数据集)
+See [Test Dataset Download](../../benchmarks/cpm/README.md#测试数据集下载地址)

 ### Kunlunxin XPU Configuration and Run Information Reference
 #### Environment Setup
 - ##### Hardware Environment
   - Machine model: Kunlunxin AI Accelerator Group R480-X8
@@ -19,16 +22,30 @@
 - Training compiler version: xacc+111e7d45 [xacc download](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/archives/111e7d45/xacc-0.1.0-cp38-cp38-linux_x86_64.whl)
 - Dependent software version: pytorch-1.12.1+cpu

+#### Run Results
+
+* General metrics
+
-### Run Results
-| Training resources | Config file | Runtime (s) | Target accuracy | Converged accuracy | Steps | Performance (samples/s) |
-| ------------------ | --------------- | ----------- | --------------- | ------------------ | ----- | ----------------------- |
-| 1 node, 1 card | config_R300x1x1 | | | | | |
-| 1 node, 2 cards | config_R300x1x2 | | | | | |
-| 1 node, 4 cards | config_R300x1x4 | | | | | |
-| 1 node, 8 cards | config_R300x1x8 | | 0.92 | 0.9235 | 632 | |
-| 2 nodes, 8 cards | config_R300x2x8 | | | | | |
+| Metric name | Metric value | Notes |
+| -------------- | ------------------------------ | ------------------------------------------- |
+| Task type | text classification, text generation | |
+| Model | cpm | |
+| Dataset | CPM-Finetune-data | |
+| Data precision | precision, see "Performance metrics" | selectable: fp32/amp/fp16 |
+| Hyperparameter changes | fix_hp, see "Performance metrics" | special hyperparameters needed to saturate the hardware for throughput measurement |
+| Hardware shorthand | R300 | |
+| Memory usage | mem (actual/total), see "Performance metrics" | commonly called "device memory", in GiB |
+| End-to-end time | e2e_time, see "Performance metrics" | total time, including Perf initialization etc. |
+| Overall throughput | p_whole, see "Performance metrics" | actual training samples divided by total time (performance_whole) |
+| Training throughput | p_train, see "Performance metrics" | excludes evaluation time at the end of each epoch |
+| **Compute throughput** | **p_core, see "Performance metrics"** | excludes data-I/O time (p3 > p2 > p1) |
+| Training result | acc, see "Performance metrics" | classification accuracy (mlm_accuracy) |
+| Additional changes | none | |

-### License
-
-Apache 2.0 license.
+* Performance metrics
+
+| Config | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem |
+| ------------------- | --------- | ---------------- | -------- | ------- | ------- | ------ | ------ | ---------- |
+| R300 single node, 1 card (1x1) | fp16 | bs=64, lr=0.0005 | | | | | | |
+| R300 single node, 8 cards (1x8) | fp16 | bs=64, lr=0.0005 | | | | | 0.9261 | 18.25/32.0 |
+| R300 two nodes, 8 cards each (2x8) | fp16 | bs=64, lr=0.0005 | | | | | | |
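The three throughput metrics defined above are simple ratios over the same sample count; a minimal sketch with hypothetical timings (placeholders, not measured results):

```python
# Illustration of the metric definitions above; all numbers are hypothetical,
# not measurements from R300 hardware.
num_samples = 632 * 512        # e.g. steps * global batch (hypothetical)
e2e_time = 1000.0              # total wall-clock seconds, incl. initialization
train_time = 900.0             # excludes per-epoch evaluation time
compute_time = 800.0           # additionally excludes data-I/O time

p_whole = num_samples / e2e_time      # overall throughput (performance_whole)
p_train = num_samples / train_time    # training throughput
p_core = num_samples / compute_time   # compute throughput; p_core > p_train > p_whole
```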
11 changes: 11 additions & 0 deletions training/kunlunxin/cpm-pytorch/config/config_R300x1x1.py
@@ -0,0 +1,11 @@
+from config_common import *
+
+train_batch_size = 64
+eval_batch_size = train_batch_size
+max_steps = 700
+max_samples_termination = 4391260
+
+warmup = 0.2
+learning_rate = 0.0005
+
+seed = 23333
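In the CPM reference schedule, `warmup` is interpreted as a fraction of the total step budget rather than as a step count; under that assumption (not verified against this harness), the config above yields:

```python
# Assuming warmup is a fraction of max_steps, as in the CPM reference
# AnnealingLR schedule (an assumption, not confirmed by this PR).
max_steps = 700
warmup = 0.2

warmup_steps = int(max_steps * warmup)  # 140 steps of LR warmup
print(warmup_steps)
```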
5 changes: 2 additions & 3 deletions training/kunlunxin/cpm-pytorch/config/config_R300x1x8.py
@@ -1,8 +1,6 @@
 from config_common import *

-dist_backend = "xccl"
-
-train_batch_size = 32
+train_batch_size = 64
 eval_batch_size = train_batch_size
 max_steps = 4000
 max_samples_termination = 4391260
@@ -11,3 +9,4 @@
 learning_rate = 0.0005

 seed = 23333
+eval_interval_samples = 512
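The new `eval_interval_samples = 512` expresses the evaluation cadence in samples rather than steps. Assuming the harness divides it by the global batch (per-card batch times world size), 64 x 8 = 512 makes this one evaluation per optimizer step; a hypothetical conversion sketch:

```python
# Hypothetical conversion from a sample-based eval interval to optimizer steps,
# assuming global batch = per-card batch * number of cards (1x8 here).
train_batch_size = 64
world_size = 8
eval_interval_samples = 512

global_batch = train_batch_size * world_size          # 512
eval_interval_steps = max(1, eval_interval_samples // global_batch)
print(eval_interval_steps)  # 1 -> evaluate after every training step
```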
11 changes: 11 additions & 0 deletions training/kunlunxin/cpm-pytorch/config/config_R300x2x8.py
@@ -0,0 +1,11 @@
+from config_common import *
+
+train_batch_size = 64
+eval_batch_size = train_batch_size
+max_steps = 700
+max_samples_termination = 4391260
+
+warmup = 0.2
+learning_rate = 0.0005
+
+seed = 23333
3 changes: 2 additions & 1 deletion training/kunlunxin/cpm-pytorch/config/config_common.py
@@ -1,7 +1,8 @@
 # DDP type: 'apex' or 'native'.
 ddp_type: str = "native"
+dist_backend = "xccl"

 # disable fp16
-fp16 = False
+fp16 = True

 vendor = 'kunlunxin'
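`dist_backend = "xccl"` names the Kunlunxin collective-communication backend, hoisted here from the per-topology config. A sketch of how such a string is typically consumed, assuming the XPU PyTorch build registers `xccl` as a process-group backend (stock PyTorch ships only gloo/nccl/mpi and would reject it):

```python
import os
import torch.distributed as dist

# Sketch only: "xccl" is valid solely on a Kunlunxin PyTorch build that
# registers it; rank/world_size here come from the usual launcher env vars.
dist_backend = "xccl"
dist.init_process_group(backend=dist_backend,
                        rank=int(os.environ.get("RANK", "0")),
                        world_size=int(os.environ.get("WORLD_SIZE", "1")))
```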
@@ -16,7 +16,9 @@ touch topo_file
 export XPUSIM_TOPOLOGY_FILE=$(readlink -f $topo_file)

 ## workaround due to ccix bug
-export BKCL_CCIX_RING="1"
+export ALLREDUCE_ASYNC="0"
+export ALLREDUCE_FUSION="0"
+export BKCL_FORCE_SYNC=1

 export XACC_ENABLE=1
 export XMLIR_D_XPU_L3_SIZE=32505856
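The replacement variables trade the removed CCIX ring setting for fully synchronous, unfused allreduce. The same workaround can be applied from a Python launcher before communication is initialized (a sketch; the variable names and values are taken verbatim from the script above):

```python
import os

# CCIX workaround, mirroring the shell exports above: disable asynchronous
# and fused allreduce and force BKCL to synchronize each collective.
os.environ["ALLREDUCE_ASYNC"] = "0"
os.environ["ALLREDUCE_FUSION"] = "0"
os.environ["BKCL_FORCE_SYNC"] = "1"
```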
18 changes: 17 additions & 1 deletion training/kunlunxin/cpm-pytorch/extern/trainer_adapter.py
@@ -3,8 +3,10 @@
 from torch import nn
 from torch.optim import Optimizer
 from typing import Tuple
-from driver.dist_pytorch import main_proc_print

+from driver.dist_pytorch import main_proc_print
+from model.fp16 import FP16_Module
+from model.fp16 import FP16_Optimizer

 def convert_model(config, model: nn.Module) -> nn.Module:
     return model
@@ -22,4 +24,18 @@ def create_optimizer(config, model):

 def model_to_fp16(config, model: nn.Module,
                   optimizer: Optimizer) -> Tuple[nn.Module, Optimizer]:
+    args = config
+    if args.fp16:
+        model = FP16_Module(model)
+        optimizer = FP16_Optimizer(optimizer,
+                                   static_loss_scale=args.loss_scale,
+                                   dynamic_loss_scale=args.dynamic_loss_scale,
+                                   dynamic_loss_args={
+                                       'scale_window': args.loss_scale_window,
+                                       'min_scale': args.min_scale,
+                                       'delayed_shift': args.hysteresis
+                                   })
+        for layer in model.modules():
+            if isinstance(layer, nn.modules.normalization.LayerNorm):
+                layer = layer.float()
     return model, optimizer
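The new `model_to_fp16` wraps the model in `FP16_Module` and the optimizer in `FP16_Optimizer` (with dynamic loss scaling when configured), then restores every `LayerNorm` to fp32, since the variance computation in normalization underflows easily at half precision. A minimal, self-contained sketch of that last pattern; note that `Module.float()` converts parameters in place and returns `self`, so the PR's rebinding `layer = layer.float()` is harmless:

```python
import torch
from torch import nn

# Convert a toy model to fp16, then restore LayerNorm to fp32 for stability,
# mirroring the loop at the end of model_to_fp16 above.
model = nn.Sequential(nn.Linear(16, 16), nn.LayerNorm(16)).half()

for layer in model.modules():
    if isinstance(layer, nn.LayerNorm):
        layer.float()  # in-place conversion; the return value can be ignored

print(model[0].weight.dtype)  # torch.float16 -> Linear stays in half precision
print(model[1].weight.dtype)  # torch.float32 -> LayerNorm restored to fp32
```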