From c7fb7a1eca829cca8ccb7fb9001cb34cd4bd8327 Mon Sep 17 00:00:00 2001
From: Stanley <290227932@qq.com>
Date: Mon, 18 Dec 2023 17:36:42 +0800
Subject: [PATCH] [kunlunxin] fix Vit and add configs (#362)

* init

* add efficientnet

* modify config

* modify config

* modify config

* add efficientnet

* modify config

* add efficientnet

* bug fix

* add efficientnet

* add efficientnet

* fix code style

* fix code style

* fix code style

* Revert "fix code style"

This reverts commit ae861095fbd0bbc31ebac82962e821f65c224ae1.

* fix code style

* fix code style

* fix code style

* fix code style

* fix code style

* bug fix

* add kunlunxin readme

* fix mobilenetv2 on kunlunxin

* add mobilenet config_R300x2x8.py

* fix mobilenetv2 on kunlunxin

* fix vit on kunlunxin

* add vit 1x8 on kunlunxin

* add vit 2x8 1x1 on kunlunxin

* add vit 2x8 1x1 on kunlunxin

* add vit 2x8 1x1 on kunlunxin

* fix code style

* fix code style

---------

Co-authored-by: Feilei Du <dufeilei@foxmail.com>
---
 training/kunlunxin/vit-pytorch/README.md      | 32 +++++++++++++++----
 .../vit-pytorch/config/config_R300x1x1.py     |  6 ++++
 .../vit-pytorch/config/config_R300x1x8.py     |  1 +
 .../vit-pytorch/config/config_R300x2x8.py     |  6 ++++
 .../config/environment_variables.sh           |  4 +++
 .../vit-pytorch/config/requirements.txt       |  2 ++
 training/run_benchmarks/config/test_conf.py   |  1 +
 7 files changed, 45 insertions(+), 7 deletions(-)
 create mode 100644 training/kunlunxin/vit-pytorch/config/config_R300x1x1.py
 create mode 100644 training/kunlunxin/vit-pytorch/config/config_R300x2x8.py
 create mode 100644 training/kunlunxin/vit-pytorch/config/environment_variables.sh
 create mode 100644 training/kunlunxin/vit-pytorch/config/requirements.txt

diff --git a/training/kunlunxin/vit-pytorch/README.md b/training/kunlunxin/vit-pytorch/README.md
index dae1d8eb9..d9f02218d 100644
--- a/training/kunlunxin/vit-pytorch/README.md
+++ b/training/kunlunxin/vit-pytorch/README.md
@@ -18,15 +18,33 @@
 
 
 ### 运行情况
-| 训练资源 | 配置文件        | 运行时长(s) | 目标精度 | 收敛精度 | Steps数 | 性能（samples/s) |
-| -------- | --------------- | ----------- | -------- | -------- | ------- | ---------------- |
-| 单机1卡  | config_R300x1x1 | /           |          | /        |         |                  |
-| 单机8卡  | config_R300x1x8 |             | 79.982   | 66.166   | 181380  |                  |
-| 两机8卡  | config_R300x2x8 | /           |          | /        |         |                  |
+* 通用指标
 
-### 收敛曲线
-![acc](acc.png)
+| 指标名称       | 指标值                                        | 特殊说明                                    |
+| -------------- | --------------------------------------------- | ------------------------------------------- |
+| 任务类别       | Image Classification && Semantic Segmentation |                                             |
+| 模型           | Vision Transformer                            |                                             |
+| 数据集         | Imagenet2012 1K                               |                                             |
+| 数据精度       | precision,见“性能指标”  | 可选fp32/amp/fp16                           |
+| 超参修改       | fix_hp,见“性能指标”     | 跑满硬件设备评测吞吐量所需特殊超参          |
+| 硬件设备简称   | R300                    |                                             |
+| 硬件存储使用   | mem,见“性能指标”        | 通常称为“显存”,单位为GiB                    |
+| 端到端时间     | e2e_time,见“性能指标”   | 总时间+Perf初始化等时间                     |
+| 总吞吐量       | p_whole,见“性能指标”    | 实际训练图片数除以总时间(performance_whole) |
+| 训练吞吐量     | p_train,见“性能指标”    | 不包含每个epoch末尾的评估部分耗时           |
+| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1)            |
+| 训练结果       | acc,见“性能指标”        | 单位为top1分类准确率(acc1)                  |
+| 额外修改项     | 无                      |                                             |
 
+
+
+* 性能指标
+
+| 配置                | precision | fix_hp         | e2e_time | p_whole | p_train | p_core | acc    | mem       |
+| ------------------- | --------- | -------------- | -------- | ------- | ------- | ------ | ------ | --------- |
+| R300单机单卡（1x1） | fp32      | /              | /        |         |         |        | /      | 23.4/32.0 |
+| R300单机8卡（1x8）  | fp32      | bs=128,lr=0.0015 |          |         |         |        | 79.30% | 24.6/32.0 |
+| R300两机8卡（2x8）  | fp32      | bs=128,lr=0.003 | /        |         |         |        | /      | 24.0/32.0 |
 ### 许可证
 
 Apache 2.0 license。
diff --git a/training/kunlunxin/vit-pytorch/config/config_R300x1x1.py b/training/kunlunxin/vit-pytorch/config/config_R300x1x1.py
new file mode 100644
index 000000000..76958251d
--- /dev/null
+++ b/training/kunlunxin/vit-pytorch/config/config_R300x1x1.py
@@ -0,0 +1,6 @@
+from config_common import *
+
+train_batch_size = 128
+eval_batch_size = 512
+gradient_accumulation_steps = 4
+# epochs = 1
\ No newline at end of file
diff --git a/training/kunlunxin/vit-pytorch/config/config_R300x1x8.py b/training/kunlunxin/vit-pytorch/config/config_R300x1x8.py
index 55be3fdf6..5fa5c7125 100644
--- a/training/kunlunxin/vit-pytorch/config/config_R300x1x8.py
+++ b/training/kunlunxin/vit-pytorch/config/config_R300x1x8.py
@@ -2,4 +2,5 @@
 
 train_batch_size = 128
 eval_batch_size = 512
+lr = 0.003 * 0.5
 gradient_accumulation_steps = 4
diff --git a/training/kunlunxin/vit-pytorch/config/config_R300x2x8.py b/training/kunlunxin/vit-pytorch/config/config_R300x2x8.py
new file mode 100644
index 000000000..423721193
--- /dev/null
+++ b/training/kunlunxin/vit-pytorch/config/config_R300x2x8.py
@@ -0,0 +1,6 @@
+from config_common import *
+
+train_batch_size = 128
+eval_batch_size = 512
+gradient_accumulation_steps = 2
+# epochs = 8
diff --git a/training/kunlunxin/vit-pytorch/config/environment_variables.sh b/training/kunlunxin/vit-pytorch/config/environment_variables.sh
new file mode 100644
index 000000000..fcdc18321
--- /dev/null
+++ b/training/kunlunxin/vit-pytorch/config/environment_variables.sh
@@ -0,0 +1,4 @@
+export XACC=1
+export BKCL_PCIE_RING=1
+export BKCL_TIMEOUT=1800
+export XMLIR_D_XPU_L3_SIZE=66060288
diff --git a/training/kunlunxin/vit-pytorch/config/requirements.txt b/training/kunlunxin/vit-pytorch/config/requirements.txt
new file mode 100644
index 000000000..9ff33e446
--- /dev/null
+++ b/training/kunlunxin/vit-pytorch/config/requirements.txt
@@ -0,0 +1,2 @@
+https://download.pytorch.org/whl/cpu/torchvision-0.13.1%2Bcpu-cp38-cp38-linux_x86_64.whl
+tabulate
diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py
index 4ddb45b7c..aa24f57b5 100644
--- a/training/run_benchmarks/config/test_conf.py
+++ b/training/run_benchmarks/config/test_conf.py
@@ -119,6 +119,7 @@
     # "transformer_xl:pytorch:R300:1:8:1": "/raid/dataset/transformer_xl/",
     # "glm:pytorch:R300:1:8:1": "/raid/home_datasets_ckpt/glm/train/",
     # "mobilenetv2:pytorch:R300:1:8:1": "/raid/dataset/ImageNet_1k_2012/",
+    # "vit:pytorch:R300:1:8:1": "/raid/dataset/ImageNet_1k_2012/",
     # "bert:pytorch:R300:1:8:1": "/raid/dataset/bert_large/train",
     # "longformer:pytorch:R300:1:8:1": "/raid/dataset/longformer_train",
     # "distilbert:pytorch:R300:1:8:1": "/raid/dataset/distilbert/",