gbps (FlagOpen#516)
shh2000 authored Apr 23, 2024
1 parent bc0f310 commit 8fb79ca
Showing 11 changed files with 273 additions and 2 deletions.
27 changes: 27 additions & 0 deletions base/benchmarks/main_memory-bandwidth/README.md
@@ -0,0 +1,27 @@
# Benchmark Methodology

1. A memory-bound copy_ operation is used to measure the chip's main memory bandwidth.
2. The operation only copies data; it does not touch gradients or any other torch bookkeeping. Only tensor values are copied, with no tensor creation or other overhead, so it is well suited to measuring main memory bandwidth.
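
A minimal sketch of the measurement idea (sizes and iteration counts here are illustrative, and a CUDA-capable PyTorch build is assumed; the full benchmark in main.py below adds warm-up, vendor sync hooks, and distributed setup):

```python
import time
import torch

x = torch.rand((256, 1024, 1024), dtype=torch.float32, device="cuda")  # 1 GiB of fp32
torch.cuda.synchronize()
t0 = time.perf_counter()
iters = 100
for _ in range(iters):
    y = x.clone()  # memory-bound copy: one read + one write of the tensor
torch.cuda.synchronize()
dt = time.perf_counter() - t0
bytes_moved = 2 * x.numel() * 4 * iters  # read + write, 4 bytes per fp32
print(f"{bytes_moved / dt / 1e9:.1f} GB/s")
```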

# Vendor Adaptation Guidelines

The configuration file for this benchmark is as follows:

```yaml
Melements: 1024
WARMUP: 100
ITERS: 100000
DIST_BACKEND: "mpi"
```
1. Melements is the number of fp32 mega-elements to copy. Vendors may set this to any positive integer to best exercise their hardware.
For example, the NVIDIA A100-40-SXM chip uses Melements=1024.
2. WARMUP is the number of warm-up iterations. Vendors may set this to any positive integer. Warm-up iterations are excluded from the performance calculation.
3. ITERS is the number of timed iterations. Vendors may adjust this value, provided the total run time is at least 6 minutes (see the worked example after this list).
4. DIST_BACKEND is the communication library. In this benchmark it is used only for initialization; no communication operators are invoked. Vendors may set it to whichever library suits their platform.
For example, the NVIDIA A100-40-SXM chip uses DIST_BACKEND="nccl".
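
As a worked example of how these settings interact (a rough sketch with an assumed sustained bandwidth, not a measured figure), the default ITERS comfortably clears the 6-minute floor on an A100-class chip:

```python
# Illustrative arithmetic only; 1300 GB/s is an assumed sustained bandwidth.
Melements = 1024
ITERS = 100000
bytes_per_iter = 2 * Melements * 1024 * 1024 * 4  # read + write, 4 bytes per fp32
total_gb = ITERS * bytes_per_iter / 1e9           # ~859,000 GB in total
print(total_gb / 1300 / 60)                       # ~11 minutes at 1300 GB/s
```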
4 changes: 4 additions & 0 deletions base/benchmarks/main_memory-bandwidth/case_config.yaml
@@ -0,0 +1,4 @@
Melements: 1024
WARMUP: 100
ITERS: 100000
DIST_BACKEND: "mpi"
97 changes: 97 additions & 0 deletions base/benchmarks/main_memory-bandwidth/main.py
@@ -0,0 +1,97 @@
import torch
import torch.distributed as dist
import os
import time
from argparse import ArgumentParser, Namespace
import yaml
import sys
sys.path.append("..")
from drivers.utils import *


def parse_args():
    parser = ArgumentParser(description=" ")

    parser.add_argument("--vendor",
                        type=str,
                        required=True,
                        help="vendor name like nvidia")

    parser.add_argument("--node_size",
                        type=int,
                        required=True,
                        help="for pytorch")

    args, unknown_args = parser.parse_known_args()
    args.unknown_args = unknown_args
    return args


def main(config, case_config, rank, world_size, local_rank):
    set_ieee_float32(config.vendor)
    if rank == 0:
        print("finish initialization")

    # Melements mega-elements of fp32, laid out as (Melements, 1024, 1024)
    Melements = case_config.Melements
    torchsize = (Melements, 1024, 1024)
    tensor = torch.rand(torchsize, dtype=torch.float32).to(local_rank)

    host_device_sync(config.vendor)
    multi_device_sync(config.vendor)
    if rank == 0:
        print("start warmup")

    # warm-up copies, excluded from the timed measurement
    for _ in range(case_config.WARMUP):
        _tensor = tensor.clone()

    host_device_sync(config.vendor)
    multi_device_sync(config.vendor)
    start_time = time.perf_counter()

    for _ in range(case_config.ITERS):
        _tensor = tensor.clone()
        # vendor-agnostic sync so each copy completes before the next issues
        host_device_sync(config.vendor)

    host_device_sync(config.vendor)
    multi_device_sync(config.vendor)
    end_time = time.perf_counter()

    elapsed_time = end_time - start_time

    # each clone reads and writes the tensor once, hence the factor of 2;
    # Melements * 1024 * 1024 fp32 elements at 4 bytes each, converted to GB
    datasize = case_config.ITERS * 2 * (Melements * 1024 * 1024 * 4 / 1E9)
    bandwidth = datasize / elapsed_time
    bandwidth_gib = bandwidth * 1E9 / (1024**3)

    return round(bandwidth, 2), round(bandwidth_gib, 2)


if __name__ == "__main__":
    config = parse_args()
    with open("case_config.yaml", "r") as file:
        case_config = yaml.safe_load(file)
    with open(os.path.join(config.vendor, "case_config.yaml"), "r") as file:
        case_config_vendor = yaml.safe_load(file)
    # vendor-specific settings override the case defaults
    case_config.update(case_config_vendor)
    case_config = Namespace(**case_config)

    dist.init_process_group(backend=case_config.DIST_BACKEND)
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    local_rank = rank % config.node_size

    gb, gib = main(config, case_config, rank, world_size, local_rank)

    multi_device_sync(config.vendor)
    for output_rank in range(config.node_size):
        # ranks take turns printing to keep the log readable
        if local_rank == output_rank:
            print(r"[FlagPerf Result]Rank {}'s main_memory-bandwidth=".format(dist.get_rank()) + str(gb) + "GB/s")
            print(r"[FlagPerf Result]Rank {}'s main_memory-bandwidth=".format(dist.get_rank()) + str(gib) + "GiB/s")
        multi_device_sync(config.vendor)

    dist.destroy_process_group()
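
A self-contained illustration of the two-level config merge performed in `__main__` above, with inline YAML strings standing in for the two config files (values mirror the shipped defaults and the NVIDIA override):

```python
from argparse import Namespace
import yaml

defaults = yaml.safe_load('Melements: 1024\nITERS: 100000\nDIST_BACKEND: "mpi"')
vendor = yaml.safe_load('DIST_BACKEND: "nccl"')
defaults.update(vendor)             # vendor keys win over case defaults
cfg = Namespace(**defaults)
print(cfg.DIST_BACKEND, cfg.ITERS)  # -> nccl 100000
```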


42 changes: 42 additions & 0 deletions base/benchmarks/main_memory-bandwidth/nvidia/README.md
@@ -0,0 +1,42 @@
# AI Chip Under Test

* Vendor: Nvidia
* Product name: A100
* Product model: A100-40GiB-SXM
* TDP: 400W

# Server Configuration

* Number of servers: 1
* Chips used per server: 8
* Server model: DGX A100
* OS version: Ubuntu 20.04.4 LTS
* OS kernel: linux5.4.0-113
* CPU: AMD EPYC7742-64core
* Docker version: 20.10.16
* Memory: 1TiB
* Inter-server chip interconnect type and bandwidth: not applicable; this benchmark uses no inter-server chip interconnect

# Results

## Core Results

| Metric | Measured main memory bandwidth (8-chip avg.) | Rated main memory bandwidth (8-chip avg.) | Measured/rated ratio (8-chip avg.) |
| ---- | -------------- | -------------- | ------------ |
| Result | 1328.25GB/s | 1555GB/s | 85.4% |

## Power Monitoring

| Metric | Avg. system power | Max system power | System power std. dev. | Server TDP | Avg. chip power (8-chip avg.) | Max chip power (8-chip max) | Chip power std. dev. (8-chip max) | Chip TDP |
| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- |
| Result | 3120.0W | 3120.0W | 0.0W | / | 239.3W | 266.0W | 44.58W | 400W |

## Other Key Monitoring Results

| Metric | Avg. system CPU usage | Avg. system memory usage | Avg. chip temperature (8-chip avg.) | Avg. chip memory usage (8-chip avg.) |
| ---- | --------- | -------- | ------------ | -------------- |
| Result | 3.334% | 1.496% | 53.29°C | 33.444% |
1 change: 1 addition & 0 deletions base/benchmarks/main_memory-bandwidth/nvidia/case_config.yaml
@@ -0,0 +1 @@
DIST_BACKEND: "nccl"
1 change: 1 addition & 0 deletions base/benchmarks/main_memory-bandwidth/nvidia/env.sh
@@ -0,0 +1 @@
echo "NVIDIA PLACEHOLDER ENV.SH"
1 change: 1 addition & 0 deletions base/benchmarks/main_memory-bandwidth/nvidia/requirements.txt
@@ -0,0 +1 @@
loguru
5 changes: 3 additions & 2 deletions base/configs/host.yaml
@@ -25,9 +25,10 @@ ACCE_VISIBLE_DEVICE_ENV_NAME: "CUDA_VISIBLE_DEVICES"
 CASES:
     "computation-FP32": "pytorch_2.3"
 
-    # nvidia "computation-FP64":"pytorch_2.3"
     # nvidia "computation-FP32":"pytorch_2.3"
     # nvidia "computation-TF32":"pytorch_2.3"
     # nvidia "computation-FP16":"pytorch_2.3"
+    # nvidia "computation-FP64":"pytorch_2.3"
     # nvidia "computation-BF16":"pytorch_2.3"
-    # nvidia "computation-INT8":"pytorch_2.3"
+    # nvidia "computation-INT8":"pytorch_2.3"
+    # nvidia "main_memory-bandwidth":"pytorch_2.3"
42 changes: 42 additions & 0 deletions base/toolkits/main_memory-bandwidth/nvidia/README.md
@@ -0,0 +1,42 @@
# AI Chip Under Test

* Vendor: Nvidia
* Product name: A100
* Product model: A100-40GiB-SXM
* TDP: 400W

# Server Configuration

* Number of servers: 1
* Chips used per server: 1
* Server model: DGX A100
* OS version: Ubuntu 20.04.4 LTS
* OS kernel: linux5.4.0-113
* CPU: AMD EPYC7742-64core
* Docker version: 20.10.16
* Memory: 1TiB
* Inter-server chip interconnect type and bandwidth: not applicable; this benchmark requires no inter-server communication

# Results

## Core Results

| Metric | Measured main memory bandwidth | Rated main memory bandwidth | Measured/rated ratio |
| ---- | ----------- | -------- | ------ |
| Result | 1329.57GB/s | 1555GB/s | 85.5% |

## Power Monitoring

| Metric | Avg. system power | Max system power | System power std. dev. | Server TDP | Avg. chip power | Max chip power | Chip power std. dev. | Chip TDP |
| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- |
| Result | 1560.0W | 1560.0W | 0.0W | / | 168.38W | 178.0W | 29.3W | 400W |

## Other Key Monitoring Results

| Metric | Avg. system CPU usage | Avg. system memory usage | Avg. chip temperature | Avg. chip memory usage |
| ---- | --------- | -------- | ------- | -------- |
| Result | 0.649% | 1.007% | 41.95°C | 76.018% |

# Vendor Tool Methodology

cudaMemcpy is used to read and write the AI chip's main memory; the measured copy time yields the chip's main memory bandwidth.
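
A back-of-the-envelope check of that accounting (the elapsed time below is an assumed figure for illustration, not a measured log): each device-to-device cudaMemcpy in bandwidth.cu reads SIZE bytes and writes SIZE bytes, hence the factor of 2 in its bandwidth formula:

```python
# Hypothetical timing, purely to illustrate the formula used in bandwidth.cu.
SIZE = 16 * 1024**3        # bytes per buffer, matching bandwidth.cu
ITERATIONS = 10000
elapsed_s = 258.0          # assumed elapsed time, for illustration only
bytes_moved = 2.0 * SIZE * ITERATIONS   # each copy = one read + one write
print(bytes_moved / elapsed_s / 1e9, "GB/s")       # ~1332 GB/s
print(bytes_moved / elapsed_s / 1024**3, "GiB/s")  # ~1240 GiB/s
```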
53 changes: 53 additions & 0 deletions base/toolkits/main_memory-bandwidth/nvidia/bandwidth.cu
@@ -0,0 +1,53 @@
#include <stdio.h>
#include <cuda_runtime.h>

#define GB (1024ULL * 1024ULL * 1024ULL)
#define SIZE (16ULL * GB)
#define WARMUP_ITERATIONS 100
#define ITERATIONS 10000

void checkCudaError(cudaError_t err, const char *msg) {
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA Error: %s: %s\n", msg, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

int main() {
    float *d_src, *d_dst;
    cudaEvent_t start, end;
    float elapsed_time;

    /* Two 16 GiB device buffers: source and destination of the copy. */
    checkCudaError(cudaMalloc(&d_src, SIZE), "cudaMalloc");
    checkCudaError(cudaMalloc(&d_dst, SIZE), "cudaMalloc");

    checkCudaError(cudaEventCreate(&start), "cudaEventCreate");
    checkCudaError(cudaEventCreate(&end), "cudaEventCreate");

    /* Warm-up copies, excluded from the timed measurement. */
    for (int i = 0; i < WARMUP_ITERATIONS; ++i) {
        checkCudaError(cudaMemcpy(d_dst, d_src, SIZE, cudaMemcpyDeviceToDevice), "cudaMemcpy");
    }

    checkCudaError(cudaEventRecord(start), "cudaEventRecord");

    for (int i = 0; i < ITERATIONS; ++i) {
        checkCudaError(cudaMemcpy(d_dst, d_src, SIZE, cudaMemcpyDeviceToDevice), "cudaMemcpy");
    }

    checkCudaError(cudaEventRecord(end), "cudaEventRecord");
    checkCudaError(cudaEventSynchronize(end), "cudaEventSynchronize");

    /* elapsed_time is reported in milliseconds. */
    checkCudaError(cudaEventElapsedTime(&elapsed_time, start, end), "cudaEventElapsedTime");

    /* Each copy reads SIZE bytes and writes SIZE bytes, hence the factor of 2;
       the result is in bytes per second. */
    double bandwidth = 2.0 * SIZE * ITERATIONS / (elapsed_time / 1000.0);

    printf("[FlagPerf Result]main_memory-bandwidth=%.2fGiB/s\n", bandwidth / (1024.0 * 1024.0 * 1024.0));
    printf("[FlagPerf Result]main_memory-bandwidth=%.2fGB/s\n", bandwidth / (1000.0 * 1000.0 * 1000.0));

    checkCudaError(cudaFree(d_src), "cudaFree");
    checkCudaError(cudaFree(d_dst), "cudaFree");
    checkCudaError(cudaEventDestroy(start), "cudaEventDestroy");
    checkCudaError(cudaEventDestroy(end), "cudaEventDestroy");

    return 0;
}
2 changes: 2 additions & 0 deletions base/toolkits/main_memory-bandwidth/nvidia/main.sh
@@ -0,0 +1,2 @@
nvcc bandwidth.cu -o bdtest
./bdtest
