From 4e2b2e877301bbb901f8b1e43617860e9a34595d Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Tue, 21 May 2024 16:58:50 +0100 Subject: [PATCH] Add bfloat-bfloat example --- benchmarks/CMakeLists.txt | 3 --- benchmarks/ampere/CMakeLists.txt | 5 +++++ .../bench_ampere_gemm_bf16_bf16_bf16_tensor_op_fp32.cu | 0 benchmarks/common/benchmark_runner.hpp | 9 +++++++++ 4 files changed, 14 insertions(+), 3 deletions(-) rename examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu => benchmarks/ampere/bench_ampere_gemm_bf16_bf16_bf16_tensor_op_fp32.cu (100%) diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 64af6dba72..5e82a856c7 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -71,6 +71,3 @@ if(SYCL_INTEL_TARGET) else(SYCL_NVIDIA_TARGET OR NOT CUTLASS_ENABLE_SYCL) add_subdirectory(ampere) endif() -if(SYCL_NVIDIA_TARGET OR NOT CUTLASS_ENABLE_SYCL) - add_subdirectory(ampere) -endif() diff --git a/benchmarks/ampere/CMakeLists.txt b/benchmarks/ampere/CMakeLists.txt index a77901594b..32cf56b157 100644 --- a/benchmarks/ampere/CMakeLists.txt +++ b/benchmarks/ampere/CMakeLists.txt @@ -36,3 +36,8 @@ cutlass_benchmark_add_executable( bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32 bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cu ) + +cutlass_benchmark_add_executable( + bench_ampere_gemm_bf16_bf16_bf16_tensor_op_fp32 + bench_ampere_gemm_bf16_bf16_bf16_tensor_op_fp32.cu +) diff --git a/examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu b/benchmarks/ampere/bench_ampere_gemm_bf16_bf16_bf16_tensor_op_fp32.cu similarity index 100% rename from examples/sycl/ampere/ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu rename to benchmarks/ampere/bench_ampere_gemm_bf16_bf16_bf16_tensor_op_fp32.cu diff --git a/benchmarks/common/benchmark_runner.hpp b/benchmarks/common/benchmark_runner.hpp index 3d73ab63fb..61a90ca378 100644 --- a/benchmarks/common/benchmark_runner.hpp +++ b/benchmarks/common/benchmark_runner.hpp @@ -155,6 
+155,15 @@ struct BenchmarkRunner { cutlass::DeviceAllocation block_D; cutlass::DeviceAllocation block_ref_D; + ElementOutput epsilon; + ElementOutput nonzero_floor; + + ExampleRunner() : epsilon(static_cast<ElementOutput>(0.1f)), + nonzero_floor(static_cast<ElementOutput>(0.1f)) {}; + + ExampleRunner(ElementOutput epsilon, ElementOutput nonzeroFloor) : + epsilon(epsilon), nonzero_floor(nonzeroFloor) {} + // // Methods //