Commit: [BAAI] Benchmark (#496)
* update ignore

* update basic-spec

* add benchmark

* fix

* fix
shh2000 authored Apr 1, 2024
1 parent b8797f2 commit a83d09c
Showing 27 changed files with 2,599 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -6,4 +6,5 @@ __pycache__/
.pytest_cache
training/result/*
inference/result/*
base/result/*
inference/onnxs/*
29 changes: 29 additions & 0 deletions base/benchmarks/computation-FP32/README.md
@@ -0,0 +1,29 @@
# Benchmark Principle

1. The compute-bound GEMM operator is used to measure the chip's FP32 compute throughput (the exact scoring formula is given below).
2. GEMM is compute-intensive and is widely used across the industry for measuring compute performance.
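
For reference, the reported score follows the standard GEMM operation count used in main.py (this restates the code, it is not an addition to the methodology):

$$\mathrm{TFLOPS} = \frac{\mathrm{ITERS} \cdot 2MNK}{t_{\mathrm{exec}} \cdot 10^{12}}$$

where $t_{\mathrm{exec}}$ is the wall-clock time of the measured iterations.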

# Vendor Adaptation Rules

The default configuration file for this benchmark is:

```yaml
M: 4096
N: 4096
K: 4096
WARMUP: 100
ITERS: 10000
DIST_BACKEND: "mpi"
```
1. M, N, and K configure the GEMM operator: the benchmark multiplies an [M,N] matrix by an [N,K] matrix. Vendors may set all three to any positive integers that best exercise their hardware (a planning sketch follows this list).
   For example, the NVIDIA A800-80-SXM chip uses M=8192, N=8192, K=8192.
2. WARMUP is the number of warm-up iterations; vendors may set it to any positive integer. Warm-up iterations are excluded from the performance calculation.
3. ITERS is the number of measured iterations; vendors may set it to any positive integer, provided the total run time is at least 6 minutes.
4. DIST_BACKEND is the communication library. In this benchmark it is used only for process-group initialization; no communication operators are executed. Vendors may substitute whichever library suits their platform.
   For example, the NVIDIA A800-80-SXM chip uses DIST_BACKEND="nccl".
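
To illustrate how ITERS interacts with the 6-minute floor, here is a minimal hypothetical sketch (not part of this commit; the helper name `min_iters` is illustrative, and 19.5 TFLOPS is the A800's rated FP32 figure from the nvidia README):

```python
import math

def min_iters(m, n, k, assumed_tflops, min_seconds=360):
    # One [M,N] x [N,K] GEMM performs 2*M*N*K floating-point operations.
    flops_per_iter = 2 * m * n * k
    seconds_per_iter = flops_per_iter / (assumed_tflops * 1e12)
    # Smallest iteration count whose measured phase lasts at least min_seconds.
    return math.ceil(min_seconds / seconds_per_iter)

# Example: the vendor config M=N=K=8192 at an assumed ~19.5 TFLOPS sustained
print(min_iters(8192, 8192, 8192, 19.5))  # ~6385 iterations
```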
6 changes: 6 additions & 0 deletions base/benchmarks/computation-FP32/case_config.yaml
@@ -0,0 +1,6 @@
M: 4096
N: 4096
K: 4096
WARMUP: 100
ITERS: 10000
DIST_BACKEND: "mpi"
99 changes: 99 additions & 0 deletions base/benchmarks/computation-FP32/main.py
@@ -0,0 +1,99 @@
import torch
import torch.distributed as dist
import os
import time
from argparse import ArgumentParser, Namespace
import yaml
import sys
sys.path.append("..")
from drivers.utils import *  # vendor-aware sync and FP32-mode helpers


def parse_args():
parser = ArgumentParser(description=" ")

parser.add_argument("--vendor",
type=str,
required=True,
help="vendor name like nvidia")

parser.add_argument("--node_size",
type=int,
required=True,
help="for pytorch")

args, unknown_args = parser.parse_known_args()
args.unknown_args = unknown_args
return args


def main(config, case_config, rank, world_size, local_rank):
    # force matmul to run in strict IEEE-754 FP32 (e.g. disable TF32 on NVIDIA)
    set_ieee_float32(config.vendor)
    if rank == 0:
        print("finish initialization")

    m = case_config.M
    n = case_config.N
    k = case_config.K

    # one [M,N] x [N,K] GEMM per iteration, on this rank's device
    matrixA = torch.randn(m, n, dtype=torch.float32).to(local_rank)
    matrixB = torch.randn(n, k, dtype=torch.float32).to(local_rank)

    host_device_sync(config.vendor)
    multi_device_sync(config.vendor)
    if rank == 0:
        print("start warmup")

    # warm-up iterations are excluded from the timed window
    for _ in range(case_config.WARMUP):
        _result = torch.mm(matrixA, matrixB)

    host_device_sync(config.vendor)
    multi_device_sync(config.vendor)
    if rank == 0:
        print("start test")

    host_device_sync(config.vendor)
    multi_device_sync(config.vendor)
    start_time = time.perf_counter()

    for _ in range(case_config.ITERS):
        _result = torch.mm(matrixA, matrixB)

    host_device_sync(config.vendor)
    multi_device_sync(config.vendor)
    end_time = time.perf_counter()

    exec_time = end_time - start_time

    # 2*M*N*K floating-point operations per GEMM
    operations = case_config.ITERS * 2 * m * n * k
    tflops = operations / exec_time / 1e12

    return round(tflops, 2)


if __name__ == "__main__":
    config = parse_args()
    # the vendor-specific case_config.yaml overrides the default one
    with open("case_config.yaml", "r") as file:
        case_config = yaml.safe_load(file)
    with open(os.path.join(config.vendor, "case_config.yaml"), "r") as file:
        case_config_vendor = yaml.safe_load(file)
    case_config.update(case_config_vendor)
    case_config = Namespace(**case_config)

    dist.init_process_group(backend=case_config.DIST_BACKEND)
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    local_rank = rank % config.node_size

    result = main(config, case_config, rank, world_size, local_rank)

    # print one local rank at a time so per-rank results are not interleaved
    multi_device_sync(config.vendor)
    for output_rank in range(config.node_size):
        if local_rank == output_rank:
            print("[FlagPerf Result]Rank {}'s computation-FP32={}TFLOPS".format(dist.get_rank(), result))
        multi_device_sync(config.vendor)

dist.destroy_process_group()
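
# A hypothetical single-node launch for reference (FlagPerf normally starts this
# via torchrun, per the comments in base/configs/host.yaml; the flags match
# parse_args above):
#   torchrun --nproc_per_node=8 main.py --vendor nvidia --node_size 8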


61 changes: 61 additions & 0 deletions base/benchmarks/computation-FP32/nvidia/README.md
@@ -0,0 +1,61 @@
# AI Chips Under Test

* Vendor: Nvidia

## Server 1

* Product name: A800
* Product model: A800-80GiB-SXM
* TDP: 400W

## Server 2

* Product name: A100
* Product model: A100-40GiB-SXM
* TDP: 400W

# Server Configuration

* Number of servers: 2

## Server 1

* Chips used per server: 8
* Server model: DGX A100
* OS version: Ubuntu 20.04.1 LTS
* OS kernel: Linux 5.4.0-126
* CPU: AMD EPYC 7742 (64 cores)
* Docker version: 20.10.18
* Memory: 1TiB
* Inter-server chip interconnect and bandwidth: 200Gb*2 IB

## Server 2

* Chips used per server: 8
* Server model: DGX A100
* OS version: Ubuntu 20.04.4 LTS
* OS kernel: Linux 5.4.0-113
* CPU: AMD EPYC 7742 (64 cores)
* Docker version: 20.10.16
* Memory: 1TiB
* Inter-server chip interconnect and bandwidth: 200Gb*2 IB

# Benchmark Results

## Core Results

| Metric | Measured FP32 compute (16-chip avg) | Rated FP32 compute (16-chip avg) | Measured/rated ratio (16-chip avg) |
| ---- | ---------------- | ---------------- | ------------- |
| Result | 19.05TFLOPS | 19.5TFLOPS | 97.7% |

## Power Monitoring

| Metric | Avg system power (2-node avg) | Max system power (2-node max) | System power std. dev. (2-node max) | Node TDP | Avg chip power (16-chip avg) | Max chip power (16-chip max) | Chip power std. dev. (16-chip max) | Chip TDP |
| ---- | ------------ | ------------ | ------------- | ----- | ------------- | ------------- | -------------- | ----- |
| Result | 3954.6W | 3978.0W | 38.21W | / | 317.2W | 376.0W | 83.7W | 400W |

## Other Monitoring

| Metric | Avg system CPU usage (2-node avg) | Avg system memory usage (2-node avg) | Avg chip temperature (16-chip avg) | Avg chip memory usage (16-chip avg) |
| ---- | --------------- | -------------- | ------------- | --------------- |
| Result | 3.456% | 2.777% | 60.12°C | 3.667% |
4 changes: 4 additions & 0 deletions base/benchmarks/computation-FP32/nvidia/case_config.yaml
@@ -0,0 +1,4 @@
M: 8192
N: 8192
K: 8192
DIST_BACKEND: "nccl"
1 change: 1 addition & 0 deletions base/benchmarks/computation-FP32/nvidia/env.sh
@@ -0,0 +1 @@
echo "NVIDIA PLACEHOLDER ENV.SH"
1 change: 1 addition & 0 deletions base/benchmarks/computation-FP32/nvidia/requirements.txt
@@ -0,0 +1 @@
loguru
1 change: 1 addition & 0 deletions base/benchmarks/drivers/__init__.py
@@ -0,0 +1 @@
from .utils import *
39 changes: 39 additions & 0 deletions base/benchmarks/drivers/utils.py
@@ -0,0 +1,39 @@
import torch


def set_ieee_float32(vendor):
    # force matmul to run in strict IEEE-754 FP32 (disable TF32 on NVIDIA)
    if vendor == "nvidia":
        torch.backends.cuda.matmul.allow_tf32 = False
    else:
        print("unspecified vendor {}, do nothing".format(vendor))


def unset_ieee_float32(vendor):
    # restore the default matmul mode (re-enable TF32 on NVIDIA)
    if vendor == "nvidia":
        torch.backends.cuda.matmul.allow_tf32 = True
    else:
        print("unspecified vendor {}, do nothing".format(vendor))


def host_device_sync(vendor):
    # block the host until all queued device work has finished
    if vendor == "nvidia":
        torch.cuda.synchronize()
    else:
        print("unspecified vendor {}, using default pytorch \"torch.cuda.synchronize\"".format(vendor))
        torch.cuda.synchronize()


def multi_device_sync(vendor):
    # barrier across all ranks in the process group
    if vendor == "nvidia":
        torch.distributed.barrier()
    else:
        print("unspecified vendor {}, using default pytorch \"torch.distributed.barrier\"".format(vendor))
        torch.distributed.barrier()


def get_memory_capacity(vendor, rank):
    # total device memory in bytes, or -1.0 for an unknown vendor
    if vendor == "nvidia":
        return torch.cuda.get_device_properties(rank).total_memory
    else:
        print("unspecified vendor {}, return -1.0".format(vendor))
        return -1.0
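
# Typical (hypothetical) usage from a benchmark's main.py:
#   set_ieee_float32("nvidia")   # run GEMMs in strict IEEE FP32
#   host_device_sync("nvidia")   # drain queued device work before timing
#   multi_device_sync("nvidia")  # barrier across all ranks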
26 changes: 26 additions & 0 deletions base/configs/host.yaml
@@ -0,0 +1,26 @@
FLAGPERF_PATH: "/home/FlagPerf/base"
FLAGPERF_LOG_PATH: "result"
VENDOR: "nvidia"
FLAGPERF_LOG_LEVEL: "info"
# "BENCHMARK" means benchmarks(torch), "TOOLKIT" means toolkits
# benchmarks using container_main to launch "torchrun benchmarks/<case>/main.py", nnodes * nproc
# toolkits using container_main to launch bash toolkits/<case>/<vendor>/main.sh, nnodes.
# only in benchmarks, flagperf will automatically execute benchmakrs/<case>/<vendor>/requirements.txt and env.sh
# all resources to be used in toolkits/<case>/<vendor>/main.sh, should be under toolkits/<case>/<vendor>/
BENCHMARKS_OR_TOOLKITS: "BENCHMARK"
HOSTS: ["192.168.1.2", "192.168.1.3"]
NPROC_PER_NODE: 8
SSH_PORT: "22"
HOSTS_PORTS: ["2222"]
MASTER_PORT: "29501"
SHM_SIZE: "32G"
ACCE_CONTAINER_OPT: " --gpus all"
# for nvidia, using " --gpus all"
# for xxx, using
PIP_SOURCE: "https://mirror.baidu.com/pypi/simple"
CLEAR_CACHES: True
# for nvidia, using "CUDA_VISIBLE_DEVICES"
# for xxx, using
ACCE_VISIBLE_DEVICE_ENV_NAME: "CUDA_VISIBLE_DEVICES"
CASES:
"computation-FP32": "pytorch_2.3"