gbps (FlagOpen#516)
shh2000 authored Apr 23, 2024
1 parent bc0f310 commit 8fb79ca
Showing 11 changed files with 273 additions and 2 deletions.
27 changes: 27 additions & 0 deletions base/benchmarks/main_memory-bandwidth/README.md
@@ -0,0 +1,27 @@
# Benchmark Methodology

1. A memory-bound copy_ operation is used to measure the chip's main memory bandwidth.
2. The operation only copies data; it does not touch gradients or any other torch bookkeeping. Only tensor values are copied, with no tensor creation or other overhead, so it is well suited to measuring main memory bandwidth.
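
A minimal sketch of the measurement idea (sizes and iteration counts here are illustrative, and a CUDA-capable PyTorch build is assumed; the full benchmark in main.py below adds warm-up, vendor sync hooks, and distributed setup):

```python
import time
import torch

x = torch.rand((256, 1024, 1024), dtype=torch.float32, device="cuda")  # 1 GiB of fp32
torch.cuda.synchronize()
t0 = time.perf_counter()
iters = 100
for _ in range(iters):
    y = x.clone()  # memory-bound copy: one read + one write of the tensor
torch.cuda.synchronize()
dt = time.perf_counter() - t0
bytes_moved = 2 * x.numel() * 4 * iters  # read + write, 4 bytes per fp32
print(f"{bytes_moved / dt / 1e9:.1f} GB/s")
```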

# Vendor Adaptation Guidelines

The configuration file for this benchmark is as follows:

```yaml
Melements: 1024
WARMUP: 100
ITERS: 100000
DIST_BACKEND: "mpi"
```
1. Melements is the number of fp32 mega-elements to copy. Vendors may set this to any positive integer to best exercise their hardware.
For example, the NVIDIA A100-40-SXM chip uses Melements=1024.
2. WARMUP is the number of warm-up iterations. Vendors may set this to any positive integer. Warm-up iterations are excluded from the performance calculation.
3. ITERS is the number of timed iterations. Vendors may adjust this value, provided the total run time is at least 6 minutes (see the worked example after this list).
4. DIST_BACKEND is the communication library. In this benchmark it is used only for initialization; no communication operators are invoked. Vendors may set it to whichever library suits their platform.
For example, the NVIDIA A100-40-SXM chip uses DIST_BACKEND="nccl".
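
As a worked example of how these settings interact (a rough sketch with an assumed sustained bandwidth, not a measured figure), the default ITERS comfortably clears the 6-minute floor on an A100-class chip:

```python
# Illustrative arithmetic only; 1300 GB/s is an assumed sustained bandwidth.
Melements = 1024
ITERS = 100000
bytes_per_iter = 2 * Melements * 1024 * 1024 * 4  # read + write, 4 bytes per fp32
total_gb = ITERS * bytes_per_iter / 1e9           # ~859,000 GB in total
print(total_gb / 1300 / 60)                       # ~11 minutes at 1300 GB/s
```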
4 changes: 4 additions & 0 deletions base/benchmarks/main_memory-bandwidth/case_config.yaml
@@ -0,0 +1,4 @@
Melements: 1024
WARMUP: 100
ITERS: 100000
DIST_BACKEND: "mpi"
97 changes: 97 additions & 0 deletions base/benchmarks/main_memory-bandwidth/main.py
@@ -0,0 +1,97 @@
import torch
import torch.distributed as dist
import os
import time
from argparse import ArgumentParser, Namespace
import yaml
import sys
sys.path.append("..")
from drivers.utils import *


def parse_args():
    parser = ArgumentParser(description=" ")

    parser.add_argument("--vendor",
                        type=str,
                        required=True,
                        help="vendor name like nvidia")

    parser.add_argument("--node_size",
                        type=int,
                        required=True,
                        help="for pytorch")

    args, unknown_args = parser.parse_known_args()
    args.unknown_args = unknown_args
    return args


def main(config, case_config, rank, world_size, local_rank):
    set_ieee_float32(config.vendor)
    if rank == 0:
        print("finish initialization")

    # Melements mega-elements of fp32, laid out as (Melements, 1024, 1024)
    Melements = case_config.Melements
    torchsize = (Melements, 1024, 1024)
    tensor = torch.rand(torchsize, dtype=torch.float32).to(local_rank)

    host_device_sync(config.vendor)
    multi_device_sync(config.vendor)
    if rank == 0:
        print("start warmup")

    # warm-up copies, excluded from the timed measurement
    for _ in range(case_config.WARMUP):
        _tensor = tensor.clone()

    host_device_sync(config.vendor)
    multi_device_sync(config.vendor)
    start_time = time.perf_counter()

    for _ in range(case_config.ITERS):
        _tensor = tensor.clone()
        # vendor-agnostic sync so each copy completes before the next issues
        host_device_sync(config.vendor)

    host_device_sync(config.vendor)
    multi_device_sync(config.vendor)
    end_time = time.perf_counter()

    elapsed_time = end_time - start_time

    # each clone reads and writes the tensor once, hence the factor of 2;
    # Melements * 1024 * 1024 fp32 elements at 4 bytes each, converted to GB
    datasize = case_config.ITERS * 2 * (Melements * 1024 * 1024 * 4 / 1E9)
    bandwidth = datasize / elapsed_time
    bandwidth_gib = bandwidth * 1E9 / (1024**3)

    return round(bandwidth, 2), round(bandwidth_gib, 2)


if __name__ == "__main__":
    config = parse_args()
    with open("case_config.yaml", "r") as file:
        case_config = yaml.safe_load(file)
    with open(os.path.join(config.vendor, "case_config.yaml"), "r") as file:
        case_config_vendor = yaml.safe_load(file)
    # vendor-specific settings override the case defaults
    case_config.update(case_config_vendor)
    case_config = Namespace(**case_config)

    dist.init_process_group(backend=case_config.DIST_BACKEND)
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    local_rank = rank % config.node_size

    gb, gib = main(config, case_config, rank, world_size, local_rank)

    multi_device_sync(config.vendor)
    for output_rank in range(config.node_size):
        # ranks take turns printing to keep the log readable
        if local_rank == output_rank:
            print(r"[FlagPerf Result]Rank {}'s main_memory-bandwidth=".format(dist.get_rank()) + str(gb) + "GB/s")
            print(r"[FlagPerf Result]Rank {}'s main_memory-bandwidth=".format(dist.get_rank()) + str(gib) + "GiB/s")
        multi_device_sync(config.vendor)

    dist.destroy_process_group()
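
A self-contained illustration of the two-level config merge performed in `__main__` above, with inline YAML strings standing in for the two config files (values mirror the shipped defaults and the NVIDIA override):

```python
from argparse import Namespace
import yaml

defaults = yaml.safe_load('Melements: 1024\nITERS: 100000\nDIST_BACKEND: "mpi"')
vendor = yaml.safe_load('DIST_BACKEND: "nccl"')
defaults.update(vendor)             # vendor keys win over case defaults
cfg = Namespace(**defaults)
print(cfg.DIST_BACKEND, cfg.ITERS)  # -> nccl 100000
```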


42 changes: 42 additions & 0 deletions base/benchmarks/main_memory-bandwidth/nvidia/README.md
@@ -0,0 +1,42 @@
# AI Chip Under Test

* Vendor: Nvidia
* Product name: A100
* Product model: A100-40GiB-SXM
* TDP: 400W

# Server Configuration

* Number of servers: 1
* Chips used per server: 8
* Server model: DGX A100
* OS version: Ubuntu 20.04.4 LTS
* OS kernel: linux5.4.0-113
* CPU: AMD EPYC7742-64core
* Docker version: 20.10.16
* Memory: 1TiB
* Inter-server chip interconnect type and bandwidth: not applicable; this benchmark uses no inter-server chip interconnect

# Results

## Core Results

| Metric | Measured main memory bandwidth (8-chip avg.) | Rated main memory bandwidth (8-chip avg.) | Measured/rated ratio (8-chip avg.) |
| ---- | -------------- | -------------- | ------------ |
| Result | 1328.25GB/s | 1555GB/s | 85.4% |

## Power Monitoring

| Metric | Avg. system power | Max system power | System power std. dev. | Server TDP | Avg. chip power (8-chip avg.) | Max chip power (8-chip max) | Chip power std. dev. (8-chip max) | Chip TDP |
| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- |
| Result | 3120.0W | 3120.0W | 0.0W | / | 239.3W | 266.0W | 44.58W | 400W |

## Other Key Monitoring Results

| Metric | Avg. system CPU usage | Avg. system memory usage | Avg. chip temperature (8-chip avg.) | Avg. chip memory usage (8-chip avg.) |
| ---- | --------- | -------- | ------------ | -------------- |
| Result | 3.334% | 1.496% | 53.29°C | 33.444% |
1 change: 1 addition & 0 deletions base/benchmarks/main_memory-bandwidth/nvidia/case_config.yaml
@@ -0,0 +1 @@
DIST_BACKEND: "nccl"
1 change: 1 addition & 0 deletions base/benchmarks/main_memory-bandwidth/nvidia/env.sh
@@ -0,0 +1 @@
echo "NVIDIA PLACEHOLDER ENV.SH"
1 change: 1 addition & 0 deletions base/benchmarks/main_memory-bandwidth/nvidia/requirements.txt
@@ -0,0 +1 @@
loguru
5 changes: 3 additions & 2 deletions base/configs/host.yaml
@@ -25,9 +25,10 @@ ACCE_VISIBLE_DEVICE_ENV_NAME: "CUDA_VISIBLE_DEVICES"
 CASES:
     "computation-FP32": "pytorch_2.3"
 
-    # nvidia "computation-FP64":"pytorch_2.3"
     # nvidia "computation-FP32":"pytorch_2.3"
     # nvidia "computation-TF32":"pytorch_2.3"
     # nvidia "computation-FP16":"pytorch_2.3"
+    # nvidia "computation-FP64":"pytorch_2.3"
     # nvidia "computation-BF16":"pytorch_2.3"
-    # nvidia "computation-INT8":"pytorch_2.3"
+    # nvidia "computation-INT8":"pytorch_2.3"
+    # nvidia "main_memory-bandwidth":"pytorch_2.3"
42 changes: 42 additions & 0 deletions base/toolkits/main_memory-bandwidth/nvidia/README.md
@@ -0,0 +1,42 @@
# AI Chip Under Test

* Vendor: Nvidia
* Product name: A100
* Product model: A100-40GiB-SXM
* TDP: 400W

# Server Configuration

* Number of servers: 1
* Chips used per server: 1
* Server model: DGX A100
* OS version: Ubuntu 20.04.4 LTS
* OS kernel: linux5.4.0-113
* CPU: AMD EPYC7742-64core
* Docker version: 20.10.16
* Memory: 1TiB
* Inter-server chip interconnect type and bandwidth: not applicable; this benchmark requires no inter-server communication

# Results

## Core Results

| Metric | Measured main memory bandwidth | Rated main memory bandwidth | Measured/rated ratio |
| ---- | ----------- | -------- | ------ |
| Result | 1329.57GB/s | 1555GB/s | 85.5% |

## Power Monitoring

| Metric | Avg. system power | Max system power | System power std. dev. | Server TDP | Avg. chip power | Max chip power | Chip power std. dev. | Chip TDP |
| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- |
| Result | 1560.0W | 1560.0W | 0.0W | / | 168.38W | 178.0W | 29.3W | 400W |

## Other Key Monitoring Results

| Metric | Avg. system CPU usage | Avg. system memory usage | Avg. chip temperature | Avg. chip memory usage |
| ---- | --------- | -------- | ------- | -------- |
| Result | 0.649% | 1.007% | 41.95°C | 76.018% |

# Vendor Tool Methodology

cudaMemcpy is used to read and write the AI chip's main memory; the measured copy time yields the chip's main memory bandwidth.
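
A back-of-the-envelope check of that accounting (the elapsed time below is an assumed figure for illustration, not a measured log): each device-to-device cudaMemcpy in bandwidth.cu reads SIZE bytes and writes SIZE bytes, hence the factor of 2 in its bandwidth formula:

```python
# Hypothetical timing, purely to illustrate the formula used in bandwidth.cu.
SIZE = 16 * 1024**3        # bytes per buffer, matching bandwidth.cu
ITERATIONS = 10000
elapsed_s = 258.0          # assumed elapsed time, for illustration only
bytes_moved = 2.0 * SIZE * ITERATIONS   # each copy = one read + one write
print(bytes_moved / elapsed_s / 1e9, "GB/s")       # ~1332 GB/s
print(bytes_moved / elapsed_s / 1024**3, "GiB/s")  # ~1240 GiB/s
```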
53 changes: 53 additions & 0 deletions base/toolkits/main_memory-bandwidth/nvidia/bandwidth.cu
@@ -0,0 +1,53 @@
#include <stdio.h>
#include <cuda_runtime.h>

#define GB (1024ULL * 1024ULL * 1024ULL)
#define SIZE (16ULL * GB)
#define WARMUP_ITERATIONS 100
#define ITERATIONS 10000

void checkCudaError(cudaError_t err, const char *msg) {
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA Error: %s: %s\n", msg, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

int main() {
    float *d_src, *d_dst;
    cudaEvent_t start, end;
    float elapsed_time;

    /* Two 16 GiB device buffers: source and destination of the copy. */
    checkCudaError(cudaMalloc(&d_src, SIZE), "cudaMalloc");
    checkCudaError(cudaMalloc(&d_dst, SIZE), "cudaMalloc");

    checkCudaError(cudaEventCreate(&start), "cudaEventCreate");
    checkCudaError(cudaEventCreate(&end), "cudaEventCreate");

    /* Warm-up copies, excluded from the timed measurement. */
    for (int i = 0; i < WARMUP_ITERATIONS; ++i) {
        checkCudaError(cudaMemcpy(d_dst, d_src, SIZE, cudaMemcpyDeviceToDevice), "cudaMemcpy");
    }

    checkCudaError(cudaEventRecord(start), "cudaEventRecord");

    for (int i = 0; i < ITERATIONS; ++i) {
        checkCudaError(cudaMemcpy(d_dst, d_src, SIZE, cudaMemcpyDeviceToDevice), "cudaMemcpy");
    }

    checkCudaError(cudaEventRecord(end), "cudaEventRecord");
    checkCudaError(cudaEventSynchronize(end), "cudaEventSynchronize");

    /* elapsed_time is reported in milliseconds. */
    checkCudaError(cudaEventElapsedTime(&elapsed_time, start, end), "cudaEventElapsedTime");

    /* Each copy reads SIZE bytes and writes SIZE bytes, hence the factor of 2;
       the result is in bytes per second. */
    double bandwidth = 2.0 * SIZE * ITERATIONS / (elapsed_time / 1000.0);

    printf("[FlagPerf Result]main_memory-bandwidth=%.2fGiB/s\n", bandwidth / (1024.0 * 1024.0 * 1024.0));
    printf("[FlagPerf Result]main_memory-bandwidth=%.2fGB/s\n", bandwidth / (1000.0 * 1000.0 * 1000.0));

    checkCudaError(cudaFree(d_src), "cudaFree");
    checkCudaError(cudaFree(d_dst), "cudaFree");
    checkCudaError(cudaEventDestroy(start), "cudaEventDestroy");
    checkCudaError(cudaEventDestroy(end), "cudaEventDestroy");

    return 0;
}
2 changes: 2 additions & 0 deletions base/toolkits/main_memory-bandwidth/nvidia/main.sh
@@ -0,0 +1,2 @@
nvcc bandwidth.cu -o bdtest
./bdtest
