From 56d5825739cd783e588930379d0c92bb6796ee61 Mon Sep 17 00:00:00 2001 From: happyxuwork Date: Mon, 4 Mar 2024 17:03:59 +0800 Subject: [PATCH] =?UTF-8?q?=E3=80=90metax=E3=80=91add=20model=20mask=5Frcn?= =?UTF-8?q?n=20and=20detr=20(#459)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add model mask_rcnn and detr * maskrcnn & detr model logs --- training/metax/detr-pytorch/README.md | 45 +++++++++++++++++ .../detr-pytorch/config/config_C500x1x1.py | 4 ++ .../detr-pytorch/config/config_C500x1x8.py | 4 ++ .../detr-pytorch/config/config_C500x2x8.py | 4 ++ .../detr-pytorch/config/config_common.py | 2 + .../config/environment_variables.sh | 5 ++ .../detr-pytorch/config/requirements.txt | 6 +++ training/metax/detr-pytorch/extern/.gitkeep | 0 training/metax/mask_rcnn-pytorch/README.md | 50 +++++++++++++++++++ .../config/config_C500x1x1.py | 5 ++ .../config/config_C500x1x8.py | 4 ++ .../config/config_C500x2x8.py | 5 ++ .../config/environment_variables.sh | 5 ++ .../mask_rcnn-pytorch/config/requirements.txt | 4 ++ .../metax/mask_rcnn-pytorch/extern/.gitkeep | 0 training/run_benchmarks/config/test_conf.py | 2 + 16 files changed, 145 insertions(+) create mode 100644 training/metax/detr-pytorch/README.md create mode 100644 training/metax/detr-pytorch/config/config_C500x1x1.py create mode 100644 training/metax/detr-pytorch/config/config_C500x1x8.py create mode 100644 training/metax/detr-pytorch/config/config_C500x2x8.py create mode 100644 training/metax/detr-pytorch/config/config_common.py create mode 100644 training/metax/detr-pytorch/config/environment_variables.sh create mode 100644 training/metax/detr-pytorch/config/requirements.txt create mode 100644 training/metax/detr-pytorch/extern/.gitkeep create mode 100644 training/metax/mask_rcnn-pytorch/README.md create mode 100644 training/metax/mask_rcnn-pytorch/config/config_C500x1x1.py create mode 100644 training/metax/mask_rcnn-pytorch/config/config_C500x1x8.py create mode 100644 
training/metax/mask_rcnn-pytorch/config/config_C500x2x8.py create mode 100644 training/metax/mask_rcnn-pytorch/config/environment_variables.sh create mode 100644 training/metax/mask_rcnn-pytorch/config/requirements.txt create mode 100644 training/metax/mask_rcnn-pytorch/extern/.gitkeep diff --git a/training/metax/detr-pytorch/README.md b/training/metax/detr-pytorch/README.md new file mode 100644 index 000000000..a16e7260d --- /dev/null +++ b/training/metax/detr-pytorch/README.md @@ -0,0 +1,45 @@ +### 测试数据集下载 +参见[测试数据集下载](../../benchmarks/detr/README.md#测试数据集下载地址) + +### 沐曦集成电路 C500 GPU配置与运行信息参考 +#### 环境配置 +- ##### 硬件环境 + - 机器、加速卡型号: 曦云®C500 64G + - 多机网络类型、带宽: InfiniBand,2x200 Gb/s + +- ##### 软件环境 + - OS版本:Ubuntu 20.04.6 + - OS kernel版本: 5.4.0-26-generic + - 加速卡驱动版本:2.2.0 + - Docker 版本:24.0.7 + - 训练框架版本:pytorch-2.0.0+mc2.19.0.6-cp38-cp38-linux_x86_64.whl + - 依赖软件版本:无 + +#### 运行情况 + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| -------------- | ------------------------------ | ------------------------------------------- | +| 任务类别 | 目标检测、全景分割 | | +| 模型 | detr | | +| 数据集 | coco2017 | | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | MXC500 | | +| 硬件存储使用 | mem(actual/total),见“性能指标” | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | +| 总吞吐量 | p_whole,见“性能指标” | 实际训练样本数除以总时间(performance_whole) | +| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | +| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1) | +| 训练结果 | mAP,见“性能指标” | mean Average Precision | +| 额外修改项 | 无 | | + + +* 性能指标 + +| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | mAP | mem | +| ------------------- | --------- | -------------- | -------- | ------- | ------- | ------ | ----- | --------- | +| MXC500 单机8卡(1x8) | fp32 | bs=8,lr=0.0001 | | | | |39.6%| 57.2/64.0 | +| MXC500 单机单卡(1x1)| fp32 | / | | | | | | 60.7/64.0 | +| MXC500 两机16卡(2x8) | fp32 | / | | | | | | 46.3/64.0 | diff --git 
a/training/metax/detr-pytorch/config/config_C500x1x1.py b/training/metax/detr-pytorch/config/config_C500x1x1.py new file mode 100644 index 000000000..f09c8b6c3 --- /dev/null +++ b/training/metax/detr-pytorch/config/config_C500x1x1.py @@ -0,0 +1,4 @@ +from config_common import * + +train_batch_size = 8 +eval_batch_size = 8 \ No newline at end of file diff --git a/training/metax/detr-pytorch/config/config_C500x1x8.py b/training/metax/detr-pytorch/config/config_C500x1x8.py new file mode 100644 index 000000000..f09c8b6c3 --- /dev/null +++ b/training/metax/detr-pytorch/config/config_C500x1x8.py @@ -0,0 +1,4 @@ +from config_common import * + +train_batch_size = 8 +eval_batch_size = 8 \ No newline at end of file diff --git a/training/metax/detr-pytorch/config/config_C500x2x8.py b/training/metax/detr-pytorch/config/config_C500x2x8.py new file mode 100644 index 000000000..f09c8b6c3 --- /dev/null +++ b/training/metax/detr-pytorch/config/config_C500x2x8.py @@ -0,0 +1,4 @@ +from config_common import * + +train_batch_size = 8 +eval_batch_size = 8 \ No newline at end of file diff --git a/training/metax/detr-pytorch/config/config_common.py b/training/metax/detr-pytorch/config/config_common.py new file mode 100644 index 000000000..851b29d4e --- /dev/null +++ b/training/metax/detr-pytorch/config/config_common.py @@ -0,0 +1,2 @@ +vendor = "metax" +dist_backend = "nccl" \ No newline at end of file diff --git a/training/metax/detr-pytorch/config/environment_variables.sh b/training/metax/detr-pytorch/config/environment_variables.sh new file mode 100644 index 000000000..a7f429ac2 --- /dev/null +++ b/training/metax/detr-pytorch/config/environment_variables.sh @@ -0,0 +1,5 @@ +# ================================================= +# Export variables +# ================================================= + +export METAX_USE_TF32=1 diff --git a/training/metax/detr-pytorch/config/requirements.txt b/training/metax/detr-pytorch/config/requirements.txt new file mode 100644 index 
000000000..061205713 --- /dev/null +++ b/training/metax/detr-pytorch/config/requirements.txt @@ -0,0 +1,6 @@ +cython +git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI&egg=pycocotools +submitit +scipy +onnx +onnxruntime \ No newline at end of file diff --git a/training/metax/detr-pytorch/extern/.gitkeep b/training/metax/detr-pytorch/extern/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/training/metax/mask_rcnn-pytorch/README.md b/training/metax/mask_rcnn-pytorch/README.md new file mode 100644 index 000000000..3247c5e42 --- /dev/null +++ b/training/metax/mask_rcnn-pytorch/README.md @@ -0,0 +1,50 @@ +### 模型backbone权重下载 +[模型backbone权重下载](../../benchmarks/mask_rcnn) + +### 测试数据集下载 + +[测试数据集下载](https://cocodataset.org/) + +### 沐曦集成电路 C500 GPU配置与运行信息参考 +#### 环境配置 +- ##### 硬件环境 + - 机器、加速卡型号: 曦云®C500 64G + - 多机网络类型、带宽: InfiniBand,2x200 Gb/s + +- ##### 软件环境 + - OS版本:Ubuntu 20.04.6 + - OS kernel版本: 5.4.0-26-generic + - 加速卡驱动版本:2.2.0 + - Docker 版本:24.0.7 + - 训练框架版本:pytorch-2.0.0+mc2.19.0.6-cp38-cp38-linux_x86_64.whl + - 依赖软件版本:无 + + + + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| -------------- | ----------------------- | ------------------------------------------- | +| 任务类别 | 图像目标检测 | | +| 模型 | mask_rcnn | | +| 数据集 | coco2017 | | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | MXC500 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | +| 总吞吐量 | p_whole,见“性能指标” | 实际训练图片数除以总时间(performance_whole) | +| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | +| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1) | +| 训练结果 | mAP,见“性能指标” | 单位为平均目标检测正确率 | +| 额外修改项 | 无 | | + + +* 性能指标 + +| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | mAP | mem | +| ------------------- | --------- | -------------- | -------- | ------- | ------- | ------ | ----- | --------- | +| MXC500 单机8卡(1x8) | fp32 | bs=8,lr=0.0001 | | | | |0.382 && 0.343| 
37.1/64.0 | +| MXC500 单机单卡(1x1)| fp32 | / | | | | | | 36.2/64.0 | +| MXC500 两机16卡(2x8) | fp32 | / | | | | | | 37.1/64.0 | diff --git a/training/metax/mask_rcnn-pytorch/config/config_C500x1x1.py b/training/metax/mask_rcnn-pytorch/config/config_C500x1x1.py new file mode 100644 index 000000000..b14441400 --- /dev/null +++ b/training/metax/mask_rcnn-pytorch/config/config_C500x1x1.py @@ -0,0 +1,5 @@ +vendor: str = "metax" +train_batch_size = 16 +eval_batch_size = 16 +lr = 0.16 +max_epoch: int = 1 \ No newline at end of file diff --git a/training/metax/mask_rcnn-pytorch/config/config_C500x1x8.py b/training/metax/mask_rcnn-pytorch/config/config_C500x1x8.py new file mode 100644 index 000000000..c11690f00 --- /dev/null +++ b/training/metax/mask_rcnn-pytorch/config/config_C500x1x8.py @@ -0,0 +1,4 @@ +vendor: str = "metax" +train_batch_size = 16 +eval_batch_size = 16 +lr = 0.16 \ No newline at end of file diff --git a/training/metax/mask_rcnn-pytorch/config/config_C500x2x8.py b/training/metax/mask_rcnn-pytorch/config/config_C500x2x8.py new file mode 100644 index 000000000..e81bc64bb --- /dev/null +++ b/training/metax/mask_rcnn-pytorch/config/config_C500x2x8.py @@ -0,0 +1,5 @@ +vendor: str = "metax" +train_batch_size = 16 +eval_batch_size = 16 +lr = 0.016 +max_epoch: int = 4 \ No newline at end of file diff --git a/training/metax/mask_rcnn-pytorch/config/environment_variables.sh b/training/metax/mask_rcnn-pytorch/config/environment_variables.sh new file mode 100644 index 000000000..a7f429ac2 --- /dev/null +++ b/training/metax/mask_rcnn-pytorch/config/environment_variables.sh @@ -0,0 +1,5 @@ +# ================================================= +# Export variables +# ================================================= + +export METAX_USE_TF32=1 diff --git a/training/metax/mask_rcnn-pytorch/config/requirements.txt b/training/metax/mask_rcnn-pytorch/config/requirements.txt new file mode 100644 index 000000000..846b45e40 --- /dev/null +++ 
b/training/metax/mask_rcnn-pytorch/config/requirements.txt @@ -0,0 +1,4 @@ +pycocotools +numpy +tqdm +schedule \ No newline at end of file diff --git a/training/metax/mask_rcnn-pytorch/extern/.gitkeep b/training/metax/mask_rcnn-pytorch/extern/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index 8175b22fd..a81790fe6 100644 --- a/training/run_benchmarks/config/test_conf.py +++ b/training/run_benchmarks/config/test_conf.py @@ -157,5 +157,7 @@ # "bert_hf:pytorch_2.0:C500:1:8:1": "/raid/dataset/bert_hf_train", # "glm:pytorch_2.0:C500:1:8:1": "/raid/home_datasets_ckpt/glm/train/", # "mobilenetv2:pytorch_2.0:C500:1:8:1": "/raid/dataset/ImageNet_1k_2012/", + # "mask_rcnn:pytorch_2.0:C500:1:8:1": "/raid/dataset/coco2017/", + # "detr:pytorch_2.0:C500:1:8:1": "/raid/dataset/coco2017/", }