From 36ae8bc031d9faad065a04cc8fd0cf7d68c5583c Mon Sep 17 00:00:00 2001 From: jingyifa Date: Fri, 26 Jan 2024 16:01:48 +0800 Subject: [PATCH 01/10] add metax swin-transformer --- .../metax/swin_transformer-pytorch/README.md | 44 +++++++++++++++++++ .../config/config_C500x1x1.py | 4 ++ .../config/config_C500x1x8.py | 4 ++ .../config/config_C500x2x8.py | 4 ++ .../config/config_common.py | 2 + .../config/environment_variables.sh | 4 ++ .../config/requirements.txt | 6 +++ .../swin_transformer-pytorch/extern/.gitkeep | 0 8 files changed, 68 insertions(+) create mode 100644 training/metax/swin_transformer-pytorch/README.md create mode 100644 training/metax/swin_transformer-pytorch/config/config_C500x1x1.py create mode 100644 training/metax/swin_transformer-pytorch/config/config_C500x1x8.py create mode 100644 training/metax/swin_transformer-pytorch/config/config_C500x2x8.py create mode 100644 training/metax/swin_transformer-pytorch/config/config_common.py create mode 100644 training/metax/swin_transformer-pytorch/config/environment_variables.sh create mode 100644 training/metax/swin_transformer-pytorch/config/requirements.txt create mode 100644 training/metax/swin_transformer-pytorch/extern/.gitkeep diff --git a/training/metax/swin_transformer-pytorch/README.md b/training/metax/swin_transformer-pytorch/README.md new file mode 100644 index 000000000..7d1328f21 --- /dev/null +++ b/training/metax/swin_transformer-pytorch/README.md @@ -0,0 +1,44 @@ +### 测试数据集下载 +[测试数据集下载](../../benchmarks/swin_transformer/README.md#数据集) + +### 沐曦集成电路 C500 GPU配置与运行信息参考 +#### 环境配置 +- ##### 硬件环境 + - 机器、加速卡型号: 曦云®C500 64G + - 多机网络类型、带宽: InfiniBand,2x200 Gb/s +- ##### 软件环境 + - OS版本:Ubuntu 20.04.6 + - OS kernel版本: 5.4.0-26-generic + - 加速卡驱动版本:2.2.0 + - Docker 版本:24.0.7 + - 训练框架版本:pytorch-2.0.0+mc2.18.0.8-cp38-cp38-linux_x86_64.whl + - 依赖软件版本:无 + + +### 运行情况 +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| -------------- | --------------------------------------------- | ------------------------------------------- 
| +| 任务类别 | Image Classification && Semantic Segmentation | | +| 模型 | swin_transformer | | +| 数据集 | Imagenet2012 1K | | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16/tf32 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | nvidia A100 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | +| 总吞吐量 | p_whole,见“性能指标” | 实际训练样本数除以总时间(performance_whole) | +| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | +| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1) | +| 训练结果 | final_acc1,见“性能指标” | 验证准确率 | +| 额外修改项 | 无 | | + +* 性能指标 + +| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | final_acc1 | mem | +| ----------------- | --------- | ------ | -------- | ------- | ------- | ------ | ---------- | --------- | +| MXC500单机8卡(1x8) | amp | / | | | | | 81.03 | 33.7/64.0 | +| MXC500单机8卡(1x8) | amp | bs=384 | / | | | | / | 40.1/64.0 | +| MXC500单机单卡(1x1) | amp | bs=384 | / | | | | / | 39.5/64.0 | +| MXC500两机8卡(2x8) | amp | bs=384 | / | | | | / | 40.0/64.0 | diff --git a/training/metax/swin_transformer-pytorch/config/config_C500x1x1.py b/training/metax/swin_transformer-pytorch/config/config_C500x1x1.py new file mode 100644 index 000000000..1872cec42 --- /dev/null +++ b/training/metax/swin_transformer-pytorch/config/config_C500x1x1.py @@ -0,0 +1,4 @@ +from config_common import * + +cfg = "configs/swin/swin_tiny_patch4_window7_224.yaml" +train_batch_size = 384 diff --git a/training/metax/swin_transformer-pytorch/config/config_C500x1x8.py b/training/metax/swin_transformer-pytorch/config/config_C500x1x8.py new file mode 100644 index 000000000..52ef64da3 --- /dev/null +++ b/training/metax/swin_transformer-pytorch/config/config_C500x1x8.py @@ -0,0 +1,4 @@ +from config_common import * + +cfg = "configs/swin/swin_tiny_patch4_window7_224.yaml" +train_batch_size = 256 diff --git a/training/metax/swin_transformer-pytorch/config/config_C500x2x8.py b/training/metax/swin_transformer-pytorch/config/config_C500x2x8.py new file mode
100644 index 000000000..1872cec42 --- /dev/null +++ b/training/metax/swin_transformer-pytorch/config/config_C500x2x8.py @@ -0,0 +1,4 @@ +from config_common import * + +cfg = "configs/swin/swin_tiny_patch4_window7_224.yaml" +train_batch_size = 384 diff --git a/training/metax/swin_transformer-pytorch/config/config_common.py b/training/metax/swin_transformer-pytorch/config/config_common.py new file mode 100644 index 000000000..851b29d4e --- /dev/null +++ b/training/metax/swin_transformer-pytorch/config/config_common.py @@ -0,0 +1,2 @@ +vendor = "metax" +dist_backend = "nccl" \ No newline at end of file diff --git a/training/metax/swin_transformer-pytorch/config/environment_variables.sh b/training/metax/swin_transformer-pytorch/config/environment_variables.sh new file mode 100644 index 000000000..e49e02a04 --- /dev/null +++ b/training/metax/swin_transformer-pytorch/config/environment_variables.sh @@ -0,0 +1,4 @@ +# ================================================= +# Export variables +# ================================================= + diff --git a/training/metax/swin_transformer-pytorch/config/requirements.txt b/training/metax/swin_transformer-pytorch/config/requirements.txt new file mode 100644 index 000000000..1a89c0b59 --- /dev/null +++ b/training/metax/swin_transformer-pytorch/config/requirements.txt @@ -0,0 +1,6 @@ +/root/.cache/torch/hub/checkpoints/torch-2.0.0+gite544b36-cp38-cp38-linux_x86_64.whl +numpy +tqdm +schedule +timm +pyyaml \ No newline at end of file diff --git a/training/metax/swin_transformer-pytorch/extern/.gitkeep b/training/metax/swin_transformer-pytorch/extern/.gitkeep new file mode 100644 index 000000000..e69de29bb From 2708a2eab0ab953f2da28e5fc5894f7a4c4fdb6c Mon Sep 17 00:00:00 2001 From: jingyifa Date: Fri, 26 Jan 2024 16:29:14 +0800 Subject: [PATCH 02/10] mod readme --- training/metax/swin_transformer-pytorch/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/metax/swin_transformer-pytorch/README.md 
b/training/metax/swin_transformer-pytorch/README.md index 7d1328f21..0f0d4b2d1 100644 --- a/training/metax/swin_transformer-pytorch/README.md +++ b/training/metax/swin_transformer-pytorch/README.md @@ -25,7 +25,7 @@ | 数据集 | Imagenet2012 1K | | | 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16/tf32 | | 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | -| 硬件设备简称 | nvidia A100 | | +| 硬件设备简称 | MXC500 A100 | | | 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | | 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | | 总吞吐量 | p_whole,见“性能指标” | 实际训练样本数除以总时间(performance_whole) | From 330e19c518c18f2fb5ca6a5e4592ce95daa0fe22 Mon Sep 17 00:00:00 2001 From: jingyifa Date: Fri, 26 Jan 2024 16:31:05 +0800 Subject: [PATCH 03/10] mod readme --- training/metax/swin_transformer-pytorch/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/metax/swin_transformer-pytorch/README.md b/training/metax/swin_transformer-pytorch/README.md index 0f0d4b2d1..8cf199b19 100644 --- a/training/metax/swin_transformer-pytorch/README.md +++ b/training/metax/swin_transformer-pytorch/README.md @@ -25,7 +25,7 @@ | 数据集 | Imagenet2012 1K | | | 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16/tf32 | | 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | -| 硬件设备简称 | MXC500 A100 | | +| 硬件设备简称 | MXC500 | | | 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | | 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | | 总吞吐量 | p_whole,见“性能指标” | 实际训练样本数除以总时间(performance_whole) | From 859979a811181c55d4b9793829ad1b2dc3566a06 Mon Sep 17 00:00:00 2001 From: jingyifa Date: Mon, 29 Jan 2024 11:08:57 +0800 Subject: [PATCH 04/10] mod swin --- training/metax/swin_transformer-pytorch/README.md | 6 +++--- .../swin_transformer-pytorch/config/config_C500x1x1.py | 1 + .../swin_transformer-pytorch/config/config_C500x2x8.py | 1 + 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/training/metax/swin_transformer-pytorch/README.md b/training/metax/swin_transformer-pytorch/README.md index 8cf199b19..46985f6a2 100644 --- 
a/training/metax/swin_transformer-pytorch/README.md +++ b/training/metax/swin_transformer-pytorch/README.md @@ -38,7 +38,7 @@ | 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | final_acc1 | mem | | ----------------- | --------- | ------ | -------- | ------- | ------- | ------ | ---------- | --------- | -| MXC500单机8卡(1x8) | amp | / | | | | | 81.03 | 33.7/64.0 | -| MXC500单机8卡(1x8) | amp | bs=384 | / | | | | / | 40.1/64.0 | +| MXC500单机8卡(1x8) | amp | / | | | | | | 33.7/64.0 | +| MXC500单机8卡(1x8) | amp | bs=384 | | | | | | 40.1/64.0 | | MXC500单机单卡(1x1) | amp | bs=384 | / | | | | / | 39.5/64.0 | -| MXC500两机8卡(2x8) | amp | bs=384 | / | | | | / | 40.0/64.0 | +| MXC500两机8卡(2x8) | amp | bs=384 | / | | | | / | 40.1/64.0 | diff --git a/training/metax/swin_transformer-pytorch/config/config_C500x1x1.py b/training/metax/swin_transformer-pytorch/config/config_C500x1x1.py index 1872cec42..d58144b29 100644 --- a/training/metax/swin_transformer-pytorch/config/config_C500x1x1.py +++ b/training/metax/swin_transformer-pytorch/config/config_C500x1x1.py @@ -2,3 +2,4 @@ cfg = "configs/swin/swin_tiny_patch4_window7_224.yaml" train_batch_size = 384 +train_epochs = 3 diff --git a/training/metax/swin_transformer-pytorch/config/config_C500x2x8.py b/training/metax/swin_transformer-pytorch/config/config_C500x2x8.py index 1872cec42..d1c6880b7 100644 --- a/training/metax/swin_transformer-pytorch/config/config_C500x2x8.py +++ b/training/metax/swin_transformer-pytorch/config/config_C500x2x8.py @@ -2,3 +2,4 @@ cfg = "configs/swin/swin_tiny_patch4_window7_224.yaml" train_batch_size = 384 +train_epochs = 40 From e84eab377f95631fb4f9e0f6fca5a46a38da2915 Mon Sep 17 00:00:00 2001 From: FaJingyi Date: Tue, 30 Jan 2024 10:12:01 +0800 Subject: [PATCH 05/10] Update README.md --- training/metax/swin_transformer-pytorch/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/training/metax/swin_transformer-pytorch/README.md 
b/training/metax/swin_transformer-pytorch/README.md index 46985f6a2..967c0fbae 100644 --- a/training/metax/swin_transformer-pytorch/README.md +++ b/training/metax/swin_transformer-pytorch/README.md @@ -38,7 +38,7 @@ | 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | final_acc1 | mem | | ----------------- | --------- | ------ | -------- | ------- | ------- | ------ | ---------- | --------- | -| MXC500单机8卡(1x8) | amp | / | | | | | | 33.7/64.0 | -| MXC500单机8卡(1x8) | amp | bs=384 | | | | | | 40.1/64.0 | +| MXC500单机8卡(1x8) | amp | / | | | | | 81.03 | 33.7/64.0 | +| MXC500单机8卡(1x8) | amp | bs=384 | | | | | 81.2 | 40.1/64.0 | | MXC500单机单卡(1x1) | amp | bs=384 | / | | | | / | 39.5/64.0 | | MXC500两机8卡(2x8) | amp | bs=384 | / | | | | / | 40.1/64.0 | From 181f100de1396c8701f928085fece4f36a761913 Mon Sep 17 00:00:00 2001 From: FaJingyi Date: Tue, 30 Jan 2024 10:12:51 +0800 Subject: [PATCH 06/10] Update config_common.py --- training/metax/swin_transformer-pytorch/config/config_common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/training/metax/swin_transformer-pytorch/config/config_common.py b/training/metax/swin_transformer-pytorch/config/config_common.py index 851b29d4e..ab908442d 100644 --- a/training/metax/swin_transformer-pytorch/config/config_common.py +++ b/training/metax/swin_transformer-pytorch/config/config_common.py @@ -1,2 +1 @@ vendor = "metax" -dist_backend = "nccl" \ No newline at end of file From 83f6953c800597ea61b476af44dfef6c80aa70c9 Mon Sep 17 00:00:00 2001 From: FaJingyi Date: Tue, 30 Jan 2024 10:17:25 +0800 Subject: [PATCH 07/10] Update requirements.txt --- .../metax/swin_transformer-pytorch/config/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/training/metax/swin_transformer-pytorch/config/requirements.txt b/training/metax/swin_transformer-pytorch/config/requirements.txt index 1a89c0b59..4f225b05d 100644 --- a/training/metax/swin_transformer-pytorch/config/requirements.txt +++ 
b/training/metax/swin_transformer-pytorch/config/requirements.txt @@ -1,6 +1,6 @@ -/root/.cache/torch/hub/checkpoints/torch-2.0.0+gite544b36-cp38-cp38-linux_x86_64.whl +http://repo.metax-tech.com/r/pypi/simple/torch-2.0.0+gite544b36-cp38-cp38-linux_x86_64.whl numpy tqdm schedule timm -pyyaml \ No newline at end of file +pyyaml From 4bcfa6bc5b822d2bc0d01e42ea9090c8a2871596 Mon Sep 17 00:00:00 2001 From: jingyifa Date: Wed, 31 Jan 2024 15:42:46 +0800 Subject: [PATCH 08/10] fix torch_six in swin_transformer --- training/benchmarks/swin_transformer/pytorch/utils/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/training/benchmarks/swin_transformer/pytorch/utils/utils.py b/training/benchmarks/swin_transformer/pytorch/utils/utils.py index 1b2225a5d..4c3235cd8 100644 --- a/training/benchmarks/swin_transformer/pytorch/utils/utils.py +++ b/training/benchmarks/swin_transformer/pytorch/utils/utils.py @@ -7,7 +7,8 @@ import torch import torch.distributed as dist -from torch._six import inf +# from torch._six import inf +from torch import inf def reduce_tensor(tensor): From 5dfa228fee9bd0af48738933c6e4a3f05106e33e Mon Sep 17 00:00:00 2001 From: FaJingyi Date: Thu, 1 Feb 2024 18:05:56 +0800 Subject: [PATCH 09/10] Update utils.py --- training/benchmarks/swin_transformer/pytorch/utils/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/training/benchmarks/swin_transformer/pytorch/utils/utils.py b/training/benchmarks/swin_transformer/pytorch/utils/utils.py index 4c3235cd8..49ea85fb2 100644 --- a/training/benchmarks/swin_transformer/pytorch/utils/utils.py +++ b/training/benchmarks/swin_transformer/pytorch/utils/utils.py @@ -7,7 +7,6 @@ import torch import torch.distributed as dist -# from torch._six import inf from torch import inf From 66c4a3cd2c4496f4082d84888a283d56fee22778 Mon Sep 17 00:00:00 2001 From: jingyifa Date: Tue, 5 Mar 2024 10:40:20 +0800 Subject: [PATCH 10/10] add metax swintrans-infer --- inference/benchmarks/swinTransformer/README.md 
| 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/inference/benchmarks/swinTransformer/README.md b/inference/benchmarks/swinTransformer/README.md index 14304fed9..ba517c573 100644 --- a/inference/benchmarks/swinTransformer/README.md +++ b/inference/benchmarks/swinTransformer/README.md @@ -84,4 +84,5 @@ find ./val -name "*JPEG" | wc -l | ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ----------- | | tensorrt | fp16 | 512 |1011.7 | 1347.5 | 1511.3 | 1231.7 | 1359.1 | 6.8% | 81.7/83.2 | 19.9/40.0 | | tensorrt | fp32 | 256 | 856.9 | 761.5 | 794.3 | 789.2 | 826.4 | 8.2% | 83.2/83.2 | 20.0/40.0 | -| kunlunxin_xtcl| W32A16 | 256 | 543.745 | / | / | / | / | / | 0.832 | / | +| kunlunxin_xtcl| W32A16 | 256 | / | / | / | / | / | / | 0.832 | / | +| metax-nocompiler| fp16 | 512 | / | / | / | / | / | 6.5% | 0.832 |10.6/64.0 |