Commit
* update ignore
* update basic-spec
* add benchmark
* fix
* fix
Showing 27 changed files with 2,599 additions and 0 deletions.
@@ -6,4 +6,5 @@ __pycache__/
.pytest_cache
training/result/*
inference/result/*
base/result/*
inference/onnxs/*
@@ -0,0 +1,29 @@
# Benchmark Principle

1. The computation-bound operator GEMM is used to measure the chip's FP32 compute capability.
2. This operator is compute-intensive and is widely used across the industry for measuring compute performance.

# Adaptation and Modification Guidelines

The configuration file for this benchmark case is as follows:

```yaml
M: 4096
N: 4096
K: 4096
WARMUP: 100
ITERS: 10000
DIST_BACKEND: "mpi"
```

1. M, N, and K configure the GEMM operator. The benchmark multiplies an [M,N] matrix by an [N,K] matrix. Vendors may set these three values to any positive integers to best demonstrate their hardware (see the timing sketch after this list for how these values map to the reported TFLOPS).
   For example, the NVIDIA A800-80-SXM chip uses M=8192, N=8192, K=8192.
2. WARMUP is the number of warm-up iterations. Vendors may set it to any positive integer. Warm-up iterations are excluded from the performance calculation.
3. ITERS is the number of measured iterations. Vendors may adjust it within the positive integers, provided the total run time is at least 6 minutes.
4. DIST_BACKEND is the communication library. In this benchmark it is used only for initialization; no communication operators are executed. Vendors may substitute whichever communication library suits their platform.
   For example, the NVIDIA A800-80-SXM chip uses DIST_BACKEND="nccl".
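To make the relationship between these settings and the reported figure concrete, here is a minimal, self-contained timing sketch. It is not part of the benchmark code: it uses the same 2*M*N*K FLOP count as main.py below, but runs a plain single-process torch.mm loop without device placement, synchronization, or the distributed setup, and shortens WARMUP/ITERS so it finishes quickly.

```python
import time
import torch

# M/N/K match case_config.yaml; WARMUP/ITERS are reduced from 100/10000
# so this sketch finishes quickly even on CPU.
M = N = K = 4096
WARMUP, ITERS = 2, 10

a = torch.randn(M, N, dtype=torch.float32)
b = torch.randn(N, K, dtype=torch.float32)

for _ in range(WARMUP):      # warm-up GEMMs are not timed
    torch.mm(a, b)

start = time.perf_counter()
for _ in range(ITERS):
    torch.mm(a, b)
elapsed = time.perf_counter() - start

flops = ITERS * 2 * M * N * K    # one multiply and one add per M*N*K term
print(f"FP32 throughput: {flops / elapsed / 1e12:.2f} TFLOPS")
```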
@@ -0,0 +1,6 @@
M: 4096
N: 4096
K: 4096
WARMUP: 100
ITERS: 10000
DIST_BACKEND: "mpi"
@@ -0,0 +1,99 @@
import torch
import torch.distributed as dist
import os
import time
from argparse import ArgumentParser, Namespace
import yaml
import sys
sys.path.append("..")
from drivers.utils import *


def parse_args():
    parser = ArgumentParser(description=" ")

    parser.add_argument("--vendor",
                        type=str,
                        required=True,
                        help="vendor name like nvidia")

    parser.add_argument("--node_size",
                        type=int,
                        required=True,
                        help="for pytorch")

    args, unknown_args = parser.parse_known_args()
    args.unknown_args = unknown_args
    return args


def main(config, case_config, rank, world_size, local_rank):
    set_ieee_float32(config.vendor)
    if rank == 0:
        print("finish initialization")

    m = case_config.M
    n = case_config.N
    k = case_config.K

    matrixA = torch.randn(m, n, dtype=torch.float32).to(local_rank)
    matrixB = torch.randn(n, k, dtype=torch.float32).to(local_rank)

    host_device_sync(config.vendor)
    multi_device_sync(config.vendor)
    if rank == 0:
        print("start warmup")

    # warm-up GEMMs, excluded from the timed section
    for _ in range(case_config.WARMUP):
        _result = torch.mm(matrixA, matrixB)

    host_device_sync(config.vendor)
    multi_device_sync(config.vendor)
    if rank == 0:
        print("start test")

    host_device_sync(config.vendor)
    multi_device_sync(config.vendor)
    start_time = time.perf_counter()

    for _ in range(case_config.ITERS):
        _result = torch.mm(matrixA, matrixB)

    host_device_sync(config.vendor)
    multi_device_sync(config.vendor)
    end_time = time.perf_counter()

    exec_time = end_time - start_time

    # each [m,n] x [n,k] GEMM performs 2*m*n*k floating point operations
    operations = case_config.ITERS * 2 * m * n * k
    tflops = operations / exec_time / 1e12

    return round(tflops, 2)


if __name__ == "__main__":
    config = parse_args()
    # load the base config, then apply vendor-specific overrides
    with open("case_config.yaml", "r") as file:
        case_config = yaml.safe_load(file)
    with open(os.path.join(config.vendor, "case_config.yaml"), "r") as file:
        case_config_vendor = yaml.safe_load(file)
    case_config.update(case_config_vendor)
    case_config = Namespace(**case_config)

    dist.init_process_group(backend=case_config.DIST_BACKEND)
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    local_rank = rank % config.node_size

    result = main(config, case_config, rank, world_size, local_rank)

    # print results one local rank at a time to keep the log readable
    multi_device_sync(config.vendor)
    for output_rank in range(config.node_size):
        if local_rank == output_rank:
            print("[FlagPerf Result]Rank {}'s computation-FP32=".format(dist.get_rank()) + str(result) + "TFLOPS")
        multi_device_sync(config.vendor)

    dist.destroy_process_group()
@@ -0,0 +1,61 @@
# AI Chips Under Evaluation

* Vendor: Nvidia

## Server 1

* Product name: A800
* Product model: A800-80GiB-SXM
* TDP: 400W

## Server 2

- Product name: A100
- Product model: A100-40GiB-SXM
- TDP: 400W

# Server Configuration

* Number of servers: 2

## Server 1

* Cards used per server: 8
* Server model: DGX A100
* OS version: Ubuntu 20.04.1 LTS
* OS kernel: linux5.4.0-126
* CPU: AMD EPYC7742-64core
* Docker version: 20.10.18
* Memory: 1TiB
* Inter-server chip interconnect and bandwidth: 200Gb*2 IB

## Server 2

- Cards used per server: 8
- Server model: DGX A100
- OS version: Ubuntu 20.04.4 LTS
- OS kernel: linux5.4.0-113
- CPU: AMD EPYC7742-64core
- Docker version: 20.10.16
- Memory: 1TiB
- Inter-server chip interconnect and bandwidth: 200Gb*2 IB

# Evaluation Results

## Core Results

| Item | Measured FP32 compute (avg. over 16 cards) | Rated FP32 compute (avg. over 16 cards) | Measured-to-rated ratio (avg. over 16 cards) |
| ---- | ---------------- | ---------------- | ------------- |
| Result | 19.05TFLOPS | 19.5TFLOPS | 97.7% |
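For reference, the measured-to-rated ratio in the table above is simply the measured value divided by the rated value: 19.05 TFLOPS / 19.5 TFLOPS ≈ 97.7%.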
## Power Monitoring Results

| Item | Avg. system power (avg. over 2 servers) | Max. system power (max. over 2 servers) | System power std. dev. (max. over 2 servers) | Server TDP | Avg. per-card power (avg. over 16 cards) | Max. per-card power (max. over 16 cards) | Per-card power std. dev. (max. over 16 cards) | Card TDP |
| ---- | ------------ | ------------ | ------------- | ----- | ------------- | ------------- | -------------- | ----- |
| Result | 3954.6W | 3978.0W | 38.21W | / | 317.2W | 376.0W | 83.7W | 400W |

## Other Key Monitoring Results

| Item | Avg. system CPU utilization (avg. over 2 servers) | Avg. system memory utilization (avg. over 2 servers) | Avg. card temperature (avg. over 16 cards) | Avg. card memory utilization (avg. over 16 cards) |
| ---- | --------------- | -------------- | ------------- | --------------- |
| Result | 3.456% | 2.777% | 60.12°C | 3.667% |
@@ -0,0 +1,4 @@
M: 8192
N: 8192
K: 8192
DIST_BACKEND: "nccl"
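These vendor values override the base case_config.yaml at run time. The snippet below is a minimal sketch of that merge, mirroring the update() call in main.py above; the file paths are illustrative.

```python
import yaml
from argparse import Namespace

# Illustrative paths: main.py opens "case_config.yaml" and "<vendor>/case_config.yaml".
with open("case_config.yaml") as f:
    case_config = yaml.safe_load(f)        # M/N/K=4096, WARMUP, ITERS, DIST_BACKEND="mpi"
with open("nvidia/case_config.yaml") as f:
    vendor_overrides = yaml.safe_load(f)   # M/N/K=8192, DIST_BACKEND="nccl"

case_config.update(vendor_overrides)       # vendor keys win; WARMUP/ITERS keep base values
cfg = Namespace(**case_config)
print(cfg.M, cfg.K, cfg.DIST_BACKEND)      # 8192 8192 nccl
```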
@@ -0,0 +1 @@
echo "NVIDIA PLACEHOLDER ENV.SH"
@@ -0,0 +1 @@
loguru
@@ -0,0 +1 @@
from .utils import *
@@ -0,0 +1,39 @@
import torch


def set_ieee_float32(vendor):
    # disable TF32 so matmul uses true IEEE FP32 arithmetic
    if vendor == "nvidia":
        torch.backends.cuda.matmul.allow_tf32 = False
    else:
        print("unspecified vendor {}, do nothing".format(vendor))


def unset_ieee_float32(vendor):
    if vendor == "nvidia":
        torch.backends.cuda.matmul.allow_tf32 = True
    else:
        print("unspecified vendor {}, do nothing".format(vendor))


def host_device_sync(vendor):
    if vendor == "nvidia":
        torch.cuda.synchronize()
    else:
        print("unspecified vendor {}, using default pytorch \"torch.cuda.synchronize\"".format(vendor))
        torch.cuda.synchronize()


def multi_device_sync(vendor):
    if vendor == "nvidia":
        torch.distributed.barrier()
    else:
        print("unspecified vendor {}, using default pytorch \"torch.distributed.barrier\"".format(vendor))
        torch.distributed.barrier()


def get_memory_capacity(vendor, rank):
    if vendor == "nvidia":
        return torch.cuda.get_device_properties(rank).total_memory
    else:
        print("unspecified vendor {}, return -1.0".format(vendor))
        return -1.0
@@ -0,0 +1,26 @@
FLAGPERF_PATH: "/home/FlagPerf/base"
FLAGPERF_LOG_PATH: "result"
VENDOR: "nvidia"
FLAGPERF_LOG_LEVEL: "info"
# "BENCHMARK" means benchmarks (torch); "TOOLKIT" means toolkits
# benchmarks use container_main to launch "torchrun benchmarks/<case>/main.py", nnodes * nproc
# toolkits use container_main to launch "bash toolkits/<case>/<vendor>/main.sh", nnodes
# only for benchmarks will flagperf automatically execute benchmarks/<case>/<vendor>/requirements.txt and env.sh
# all resources used in toolkits/<case>/<vendor>/main.sh should live under toolkits/<case>/<vendor>/
BENCHMARKS_OR_TOOLKITS: "BENCHMARK"
HOSTS: ["192.168.1.2", "192.168.1.3"]
NPROC_PER_NODE: 8
SSH_PORT: "22"
HOSTS_PORTS: ["2222"]
MASTER_PORT: "29501"
SHM_SIZE: "32G"
ACCE_CONTAINER_OPT: " --gpus all"
# for nvidia, use " --gpus all"
# for xxx, use
PIP_SOURCE: "https://mirror.baidu.com/pypi/simple"
CLEAR_CACHES: True
# for nvidia, use "CUDA_VISIBLE_DEVICES"
# for xxx, use
ACCE_VISIBLE_DEVICE_ENV_NAME: "CUDA_VISIBLE_DEVICES"
CASES:
  "computation-FP32": "pytorch_2.3"
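The BENCHMARK comments above describe launching "torchrun benchmarks/<case>/main.py" across nnodes * nproc processes. The sketch below only illustrates how the host.yaml values could map onto such a launch command; it is not FlagPerf's actual container_main launcher, and the exact flag wiring (and the --vendor/--node_size arguments taken from main.py's parse_args) is an assumption.

```python
# Illustrative only: not FlagPerf's real launcher. Shows how host.yaml values
# could map onto the "torchrun benchmarks/<case>/main.py" launch described above.
HOSTS = ["192.168.1.2", "192.168.1.3"]         # from host.yaml
NPROC_PER_NODE = 8
MASTER_PORT = "29501"
CASE, ENV = "computation-FP32", "pytorch_2.3"  # from the CASES mapping

for node_rank, host in enumerate(HOSTS):
    cmd = (
        f"torchrun --nnodes={len(HOSTS)} --nproc_per_node={NPROC_PER_NODE} "
        f"--node_rank={node_rank} --master_addr={HOSTS[0]} --master_port={MASTER_PORT} "
        f"benchmarks/{CASE}/main.py --vendor=nvidia --node_size={NPROC_PER_NODE}"
    )
    # a command like this would run on each host, inside the ENV container
    print(f"[{host}] ({ENV}) {cmd}")
```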