forked from FlagOpen/FlagPerf
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
11 changed files
with
273 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# 评测原理 | ||
|
||
1. 使用memory-bound的copy_方法来评测芯片主存储带宽 | ||
2. 此方法仅进行数据复制,不对torch中的梯度等进行处理。同时仅将数据值进行复制,不包含创建张量等操作,适用于评测芯片主存储带宽 | ||
|
||
# 适配修改规范 | ||
|
||
本评测样例配置文件如下: | ||
|
||
```yaml | ||
Melements: 1024 | ||
WARMUP: 100 | ||
ITERS: 100000 | ||
DIST_BACKEND: "mpi" | ||
``` | ||
1. Melements为复制的fp32元素个数。厂商可在正整数范围内任意调整此项配置,发挥自身能力 | ||
例如,英伟达A100-40-SXM芯片采用Melements=1024 | ||
2. WARMUP为预热所需迭代次数。厂商可在正整数范围内任意调整此值。WARMUP迭代部分不计入性能计算 | ||
3. ITERS为正式评测迭代次数。厂商可在正整数范围内调整此值,同时保证总运行时间大于等于6分钟 | ||
4. DIST_BACKEND为通讯库。在本评测样例中,仅供初始化使用,无通信算子。厂商可任意调整为适用于自己的通讯库 | ||
例如,英伟达A100-40-SXM芯片采用DIST_BACKEND="nccl" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
Melements: 1024 | ||
WARMUP: 100 | ||
ITERS: 100000 | ||
DIST_BACKEND: "mpi" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
import torch | ||
import torch.distributed as dist | ||
import os | ||
import time | ||
from argparse import ArgumentParser, Namespace | ||
import yaml | ||
import sys | ||
sys.path.append("..") | ||
from drivers.utils import * | ||
|
||
|
||
def parse_args():
    """Parse command-line options for the benchmark driver.

    Returns a Namespace with ``vendor`` (str), ``node_size`` (int), and
    ``unknown_args`` (list of unrecognized CLI tokens, preserved so the
    caller can inspect or forward them).
    """
    cli = ArgumentParser(description=" ")
    cli.add_argument("--vendor", type=str, required=True,
                     help="vendor name like nvidia")
    cli.add_argument("--node_size", type=int, required=True,
                     help="for pytorch")
    known, extra = cli.parse_known_args()
    # Stash leftovers on the namespace rather than discarding them.
    known.unknown_args = extra
    return known
|
||
|
||
def main(config, case_config, rank, world_size, local_rank):
    """Measure one accelerator's main-memory bandwidth via tensor copies.

    Allocates a float32 tensor of ``Melements * 1024 * 1024`` elements on
    device ``local_rank``, runs ``WARMUP`` untimed clone() iterations, then
    times ``ITERS`` clones. Each clone reads and writes the whole tensor,
    so one iteration moves 2x its byte size through main memory.

    Args:
        config: CLI namespace; ``vendor`` selects the sync helpers.
        case_config: case settings (``Melements``, ``WARMUP``, ``ITERS``).
        rank: global rank of this process.
        world_size: total process count (unused; kept for driver symmetry).
        local_rank: device index on this node.

    Returns:
        Tuple ``(gb, gib)``: bandwidth rounded to 2 decimals, in decimal
        GB/s and binary GiB/s respectively.
    """
    set_ieee_float32(config.vendor)
    if rank == 0:
        print("finish initialization")

    Melements = case_config.Melements
    # Melements mebi-elements of fp32 => Melements * 1024 * 1024 * 4 bytes.
    torchsize = (Melements, 1024, 1024)
    tensor = torch.rand(torchsize, dtype=torch.float32).to(local_rank)

    host_device_sync(config.vendor)
    multi_device_sync(config.vendor)
    if rank == 0:
        print("start warmup")

    for _ in range(case_config.WARMUP):
        _tensor = tensor.clone()

    host_device_sync(config.vendor)
    multi_device_sync(config.vendor)
    start_time = time.perf_counter()

    for _ in range(case_config.ITERS):
        _tensor = tensor.clone()
        # Bug fix: was torch.cuda.synchronize(), which hardcodes CUDA in an
        # otherwise vendor-neutral harness and fails on non-NVIDIA chips.
        # Use the same vendor dispatch helper as the rest of the file.
        host_device_sync(config.vendor)

    host_device_sync(config.vendor)
    multi_device_sync(config.vendor)
    end_time = time.perf_counter()

    elapsed_time = end_time - start_time

    # Factor 2: each clone performs one full read plus one full write.
    # Dividing bytes by 1E9 yields decimal gigabytes.
    datasize = case_config.ITERS * 2 * (Melements * 1024 * 1024 * 4 / 1E9)
    bandwidth = datasize / elapsed_time          # GB/s (decimal)
    bandwidth_gib = bandwidth * 1E9 / (1024**3)  # GiB/s (binary)

    return round(bandwidth, 2), round(bandwidth_gib, 2)
|
||
|
||
if __name__ == "__main__":
    config = parse_args()
    # Merge the base case config with vendor-specific overrides; the
    # vendor file wins on duplicate keys.
    with open("case_config.yaml", "r") as file:
        case_config = yaml.safe_load(file)
    with open(os.path.join(config.vendor, "case_config.yaml"), "r") as file:
        case_config_vendor = yaml.safe_load(file)
    case_config.update(case_config_vendor)
    case_config = Namespace(**case_config)

    # One process per device. DIST_BACKEND is used only for init and
    # barriers here; the benchmark itself runs no communication ops.
    dist.init_process_group(backend=case_config.DIST_BACKEND)
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    local_rank = rank % config.node_size

    gb, gib = main(config, case_config, rank, world_size, local_rank)

    # Print results one local rank at a time, with a barrier between
    # turns, so per-rank lines do not interleave in the shared log.
    # NOTE(review): "bindwidth" is a typo, kept verbatim because
    # downstream log parsers may match this exact string — confirm
    # before renaming.
    multi_device_sync(config.vendor)
    for output_rank in range(config.node_size):
        if local_rank == output_rank:
            print(r"[FlagPerf Result]Rank {}'s main_memory-bindwidth=".format(dist.get_rank()) + str(gb) + "GB/s")
            print(r"[FlagPerf Result]Rank {}'s main_memory-bindwidth=".format(dist.get_rank()) + str(gib) + "GiB/s")
        multi_device_sync(config.vendor)

    dist.destroy_process_group()
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
# 参评AI芯片信息 | ||
|
||
* 厂商:Nvidia | ||
|
||
|
||
* 产品名称:A100 | ||
* 产品型号:A100-40GiB-SXM | ||
* TDP:400W | ||
|
||
# 所用服务器配置 | ||
|
||
* 服务器数量:1 | ||
|
||
|
||
* 单服务器内使用卡数:8 | ||
* 服务器型号:DGX A100 | ||
* 操作系统版本:Ubuntu 20.04.4 LTS | ||
* 操作系统内核:linux5.4.0-113 | ||
* CPU:AMD EPYC7742-64core | ||
* docker版本:20.10.16 | ||
* 内存:1TiB | ||
* 服务器间AI芯片直连规格及带宽:此评测项不涉及服务器间AI芯片直连 | ||
|
||
# 评测结果 | ||
|
||
## 核心评测结果 | ||
|
||
| 评测项 | 主存储带宽测试值(8卡平均) | 主存储带宽标定值(8卡平均) | 测试标定比例(8卡平均) | | ||
| ---- | -------------- | -------------- | ------------ | | ||
| 评测结果 | 1328.25GB/s | 1555GB/s | 85.4% | | ||
|
||
## 能耗监控结果 | ||
|
||
| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗(8卡平均) | 单卡最大功耗(8卡最大) | 单卡功耗标准差(8卡最大) | 单卡TDP | | ||
| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | | ||
| 监控结果 | 3120.0W | 3120.0W | 0.0W | / | 239.3W | 266.0W | 44.58W | 400W | | ||
|
||
## 其他重要监控结果 | ||
|
||
| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度(8卡平均) | 单卡平均显存占用(8卡平均) | | ||
| ---- | --------- | -------- | ------------ | -------------- | | ||
| 监控结果 | 3.334% | 1.496% | 53.29°C | 33.444% | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
DIST_BACKEND: "nccl" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
echo "NVIDIA PLACEHOLDER ENV.SH" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
loguru |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
# 参评AI芯片信息 | ||
|
||
* 厂商:Nvidia | ||
* 产品名称:A100 | ||
* 产品型号:A100-40GiB-SXM | ||
* TDP:400W | ||
|
||
# 所用服务器配置 | ||
|
||
* 服务器数量:1 | ||
* 单服务器内使用卡数:1 | ||
* 服务器型号:DGX A100 | ||
* 操作系统版本:Ubuntu 20.04.4 LTS | ||
* 操作系统内核:linux5.4.0-113 | ||
* CPU:AMD EPYC7742-64core | ||
* docker版本:20.10.16 | ||
* 内存:1TiB | ||
* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 | ||
|
||
# 评测结果 | ||
|
||
## 核心评测结果 | ||
|
||
| 评测项 | 主存储带宽测试值 | 主存储带宽标定值 | 测试标定比例 | | ||
| ---- | ----------- | -------- | ------ | | ||
| 评测结果 | 1329.57GB/s | 1555GB/s | 85.5% | | ||
|
||
## 能耗监控结果 | ||
|
||
| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | | ||
| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- | | ||
| 监控结果 | 1560.0W | 1560.0W | 0.0W | / | 168.38W | 178.0W | 29.3W | 400W | | ||
|
||
## 其他重要监控结果 | ||
|
||
| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡平均显存占用 | | ||
| ---- | --------- | -------- | ------- | -------- | | ||
| 监控结果 | 0.649% | 1.007% | 41.95°C | 76.018% | | ||
|
||
# 厂商测试工具原理说明 | ||
|
||
使用cudaMemcpy,进行读+写AI芯片主存储操作,计算AI芯片主存储带宽 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
#include <stdio.h> | ||
#include <cuda_runtime.h> | ||
|
||
#define GB (1024ULL * 1024ULL * 1024ULL) | ||
#define SIZE (16ULL * GB) | ||
#define WARMUP_ITERATIONS 100 | ||
#define ITERATIONS 10000 | ||
|
||
/* Abort the benchmark if a CUDA runtime call failed.
 * `msg` names the call site so the failure is attributable in stderr. */
void checkCudaError(cudaError_t err, const char *msg) {
    if (err == cudaSuccess) {
        return;  /* nothing to report */
    }
    fprintf(stderr, "CUDA Error: %s: %s\n", msg, cudaGetErrorString(err));
    exit(EXIT_FAILURE);
}
|
||
/* Device-to-device copy benchmark: times ITERATIONS cudaMemcpy calls
 * between two SIZE-byte device buffers and reports the implied main-memory
 * bandwidth (each copy reads SIZE bytes and writes SIZE bytes). */
int main() {
    float *src_buf;
    float *dst_buf;
    cudaEvent_t t_begin, t_end;
    float ms;
    int i;

    checkCudaError(cudaMalloc(&src_buf, SIZE), "cudaMalloc");
    checkCudaError(cudaMalloc(&dst_buf, SIZE), "cudaMalloc");
    checkCudaError(cudaEventCreate(&t_begin), "cudaEventCreate");
    checkCudaError(cudaEventCreate(&t_end), "cudaEventCreate");

    /* Untimed warm-up copies so clocks and caches settle before timing. */
    for (i = 0; i < WARMUP_ITERATIONS; ++i)
        checkCudaError(cudaMemcpy(dst_buf, src_buf, SIZE, cudaMemcpyDeviceToDevice), "cudaMemcpy");

    /* Events are recorded in stream order, bracketing the timed loop. */
    checkCudaError(cudaEventRecord(t_begin), "cudaEventRecord");
    for (i = 0; i < ITERATIONS; ++i)
        checkCudaError(cudaMemcpy(dst_buf, src_buf, SIZE, cudaMemcpyDeviceToDevice), "cudaMemcpy");
    checkCudaError(cudaEventRecord(t_end), "cudaEventRecord");
    checkCudaError(cudaEventSynchronize(t_end), "cudaEventSynchronize");
    checkCudaError(cudaEventElapsedTime(&ms, t_begin, t_end), "cudaEventElapsedTime");

    /* cudaEventElapsedTime yields milliseconds; factor 2.0 counts one
     * read plus one write of SIZE bytes per copy. Result is bytes/sec. */
    double bandwidth = 2.0 * SIZE * ITERATIONS / (ms / 1000.0);
    printf("[FlagPerf Result]main_memory-bandwidth=%.2fGiB/s\n", bandwidth / (1024.0 * 1024.0 * 1024.0));
    printf("[FlagPerf Result]main_memory-bandwidth=%.2fGB/s\n", bandwidth / (1000.0 * 1000.0 * 1000.0));

    checkCudaError(cudaFree(src_buf), "cudaFree");
    checkCudaError(cudaFree(dst_buf), "cudaFree");
    checkCudaError(cudaEventDestroy(t_begin), "cudaEventDestroy");
    checkCudaError(cudaEventDestroy(t_end), "cudaEventDestroy");

    return 0;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Build the CUDA main-memory bandwidth micro-benchmark with nvcc...
nvcc bandwidth.cu -o bdtest
# ...and run it; it prints "[FlagPerf Result]main_memory-bandwidth=..." lines.
./bdtest