From c7fb7a1eca829cca8ccb7fb9001cb34cd4bd8327 Mon Sep 17 00:00:00 2001 From: Stanley <290227932@qq.com> Date: Mon, 18 Dec 2023 17:36:42 +0800 Subject: [PATCH] [kunlunxin] fix Vit and add configs (#362) * init * add efficientnet * modify config * modify config * modify config * add efficientnet * modify config * add efficientnet * bug fix * add efficientnet * add efficientnet * fix code style * fix code style * fix code style * Revert "fix code style" This reverts commit ae861095fbd0bbc31ebac82962e821f65c224ae1. * fix code style * fix code style * fix code style * fix code style * fix code style * bug fix * add kunlunxin readme * fix mobilenetv2 on kunlunxin * add mobilenet config_R300x2x8.py * fix mobilenetv2 on kunlunxin * fix vit on kunlunxin * add vit 1x8 on kunlunxin * add vit 2x8 1x1 on kunlunxin * add vit 2x8 1x1 on kunlunxin * add vit 2x8 1x1 on kunlunxin * fix code style * fix code style --------- Co-authored-by: Feilei Du --- training/kunlunxin/vit-pytorch/README.md | 32 +++++++++++++++---- .../vit-pytorch/config/config_R300x1x1.py | 6 ++++ .../vit-pytorch/config/config_R300x1x8.py | 1 + .../vit-pytorch/config/config_R300x2x8.py | 6 ++++ .../config/environment_variables.sh | 4 +++ .../vit-pytorch/config/requirements.txt | 2 ++ training/run_benchmarks/config/test_conf.py | 1 + 7 files changed, 45 insertions(+), 7 deletions(-) create mode 100644 training/kunlunxin/vit-pytorch/config/config_R300x1x1.py create mode 100644 training/kunlunxin/vit-pytorch/config/config_R300x2x8.py create mode 100644 training/kunlunxin/vit-pytorch/config/environment_variables.sh create mode 100644 training/kunlunxin/vit-pytorch/config/requirements.txt diff --git a/training/kunlunxin/vit-pytorch/README.md b/training/kunlunxin/vit-pytorch/README.md index dae1d8eb9..d9f02218d 100644 --- a/training/kunlunxin/vit-pytorch/README.md +++ b/training/kunlunxin/vit-pytorch/README.md @@ -18,15 +18,33 @@ ### 运行情况 -| 训练资源 | 配置文件 | 运行时长(s) | 目标精度 | 收敛精度 | Steps数 | 性能(samples/s) | -| -------- | 
--------------- | ----------- | -------- | -------- | ------- | ---------------- | -| 单机1卡 | config_R300x1x1 | / | | / | | | -| 单机8卡 | config_R300x1x8 | | 79.982 | 66.166 | 181380 | | -| 两机8卡 | config_R300x2x8 | / | | / | | | +* 通用指标 -### 收敛曲线 -![acc](acc.png) +| 指标名称 | 指标值 | 特殊说明 | +| -------------- | --------------------------------------------- | ------------------------------------------- | +| 任务类别 | Image Classification && Semantic Segmentation | | +| 模型 | Vision Transformer | | +| 数据集 | Imagenet2012 1K | | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | R300 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | +| 总吞吐量 | p_whole,见“性能指标” | 实际训练图片数除以总时间(performance_whole) | +| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | +| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p_core>p_train>p_whole) | +| 训练结果 | acc,见“性能指标” | 单位为top1分类准确率(acc1) | +| 额外修改项 | 无 | | + + +* 性能指标 + +| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem | +| ------------------- | --------- | -------------- | -------- | ------- | ------- | ------ | ------ | --------- | +| R300单机单卡(1x1) | fp32 | / | / | | | | / | 23.4/32.0 | +| R300单机8卡(1x8) | fp32 | bs=128,lr=0.0015 | | | | | 79.30% | 24.6/32.0 | +| R300两机8卡(2x8) | fp32 | bs=128,lr=0.003 | / | | | | / | 24.0/32.0 | ### 许可证 Apache 2.0 license。 diff --git a/training/kunlunxin/vit-pytorch/config/config_R300x1x1.py b/training/kunlunxin/vit-pytorch/config/config_R300x1x1.py new file mode 100644 index 000000000..76958251d --- /dev/null +++ b/training/kunlunxin/vit-pytorch/config/config_R300x1x1.py @@ -0,0 +1,6 @@ +from config_common import * + +train_batch_size = 128 +eval_batch_size = 512 +gradient_accumulation_steps = 4 +# epochs = 1 \ No newline at end of file diff --git a/training/kunlunxin/vit-pytorch/config/config_R300x1x8.py b/training/kunlunxin/vit-pytorch/config/config_R300x1x8.py index 55be3fdf6..5fa5c7125 100644 --- 
a/training/kunlunxin/vit-pytorch/config/config_R300x1x8.py +++ b/training/kunlunxin/vit-pytorch/config/config_R300x1x8.py @@ -2,4 +2,5 @@ train_batch_size = 128 eval_batch_size = 512 +lr = 0.003 * 0.5 gradient_accumulation_steps = 4 diff --git a/training/kunlunxin/vit-pytorch/config/config_R300x2x8.py b/training/kunlunxin/vit-pytorch/config/config_R300x2x8.py new file mode 100644 index 000000000..423721193 --- /dev/null +++ b/training/kunlunxin/vit-pytorch/config/config_R300x2x8.py @@ -0,0 +1,6 @@ +from config_common import * + +train_batch_size = 128 +eval_batch_size = 512 +gradient_accumulation_steps = 2 +# epochs = 8 diff --git a/training/kunlunxin/vit-pytorch/config/environment_variables.sh b/training/kunlunxin/vit-pytorch/config/environment_variables.sh new file mode 100644 index 000000000..fcdc18321 --- /dev/null +++ b/training/kunlunxin/vit-pytorch/config/environment_variables.sh @@ -0,0 +1,4 @@ +export XACC=1 +export BKCL_PCIE_RING=1 +export BKCL_TIMEOUT=1800 +export XMLIR_D_XPU_L3_SIZE=66060288 diff --git a/training/kunlunxin/vit-pytorch/config/requirements.txt b/training/kunlunxin/vit-pytorch/config/requirements.txt new file mode 100644 index 000000000..9ff33e446 --- /dev/null +++ b/training/kunlunxin/vit-pytorch/config/requirements.txt @@ -0,0 +1,2 @@ +https://download.pytorch.org/whl/cpu/torchvision-0.13.1%2Bcpu-cp38-cp38-linux_x86_64.whl +tabulate diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index 4ddb45b7c..aa24f57b5 100644 --- a/training/run_benchmarks/config/test_conf.py +++ b/training/run_benchmarks/config/test_conf.py @@ -119,6 +119,7 @@ # "transformer_xl:pytorch:R300:1:8:1": "/raid/dataset/transformer_xl/", # "glm:pytorch:R300:1:8:1": "/raid/home_datasets_ckpt/glm/train/", # "mobilenetv2:pytorch:R300:1:8:1": "/raid/dataset/ImageNet_1k_2012/", + # "vit:pytorch:R300:1:8:1": "/raid/dataset/ImageNet_1k_2012/", # "bert:pytorch:R300:1:8:1": "/raid/dataset/bert_large/train", # 
"longformer:pytorch:R300:1:8:1": "/raid/dataset/longformer_train", # "distilbert:pytorch:R300:1:8:1": "/raid/dataset/distilbert/",