From b67cee6a304949ce426240f5d236d3cba4c2a4f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mos=C3=A8=20Giordano?=
Date: Sat, 13 Jan 2024 17:46:29 +0000
Subject: [PATCH] Support Nvidia Hopper GPUs

---
 Project.toml                                 |  2 +-
 ext/CUDAExt/implementations/peakflops_gpu.jl | 32 +++++++++++++++-----
 ext/CUDAExt/peakflops_gpu_wmmas.jl           |  4 ++--
 3 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/Project.toml b/Project.toml
index 053af89..39c4f87 100644
--- a/Project.toml
+++ b/Project.toml
@@ -28,7 +28,7 @@ CUDAExt = "CUDA"
 CairoMakieExt = "CairoMakie"
 
 [compat]
-CUDA = "3.8.4, 3.12, 4.4"
+CUDA = "3.8.4, 3.12, 4.4, 5"
 CairoMakie = "0.7, 0.10.7"
 CpuId = "0.3"
 DocStringExtensions = "0.9"
diff --git a/ext/CUDAExt/implementations/peakflops_gpu.jl b/ext/CUDAExt/implementations/peakflops_gpu.jl
index 0d6bb2e..7bd306f 100644
--- a/ext/CUDAExt/implementations/peakflops_gpu.jl
+++ b/ext/CUDAExt/implementations/peakflops_gpu.jl
@@ -51,7 +51,7 @@ function _theoretical_peakflops_gpu_cudacores(; device, dtype)
     elseif dtype == Float64
         max_peakflops *= 1
     else
-        throw(ArgumentError("Unsupported dtype."))
+        throw(ArgumentError("Unsupported dtype $(dtype)."))
     end
     return max_peakflops
 end
@@ -60,7 +60,9 @@ function _theoretical_peakflops_gpu_tensorcores(;
     device=CUDA.device(), dtype=Float16, verbose=true
 )
     cap = CUDA.capability(device)
-    if cap == v"8.0.0"
+    if cap == v"9.0.0"
+        devtype = :Hopper
+    elseif cap == v"8.0.0"
         devtype = :A100
     elseif cap == v"7.0.0"
         devtype = :V100
@@ -70,10 +72,26 @@
     max_clock_rate = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_CLOCK_RATE) # in kHz
     num_tensor_cores = ntensorcores(device)
     max_peakflops = max_clock_rate * num_tensor_cores * 1e-9 # in TFLOP/s
-    if devtype == :A100
+    if devtype == :Hopper
+        # matrix dimensions 8x8x4, factor 2 for nflops in A*B+C see
+        # * (figures 10-11)
+        # * (figures 5-8)
         if Symbol(dtype) == :Float16
-            # matrix dimensions 8x8x4, factor 2 for nflops in A*B+C
-            # see e.g. https://peerj.com/articles/cs-330.pdf
+            max_peakflops *= 2 * 16 * 8 * 4 # XXX: Wrong result!
+        elseif Symbol(dtype) in (:Float32, :TensorFloat32, :TF32)
+            max_peakflops *= 2 * 8 * 8 * 4 # XXX: Wrong result!
+        elseif Symbol(dtype) == :Float64
+            max_peakflops *= 2 * 4 * 4 * 2
+        elseif Symbol(dtype) == :Int8
+            max_peakflops *= 2 * 2 * 32 * 8 * 4 # XXX: Wrong result!
+        else
+            throw(ArgumentError("Unsupported dtype $(dtype)."))
+        end
+    elseif devtype == :A100
+        if Symbol(dtype) == :Float16
+            # matrix dimensions 8x8x4, factor 2 for nflops in A*B+C see
+            # e.g. https://peerj.com/articles/cs-330.pdf or
+            #
             max_peakflops *= 2 * 8 * 8 * 4
         elseif Symbol(dtype) in (:Float32, :TensorFloat32, :TF32)
             max_peakflops *= 2 * 4 * 8 * 4
@@ -82,13 +100,13 @@
         elseif Symbol(dtype) == :Int8
             max_peakflops *= 2 * 2 * 8 * 8 * 4
         else
-            throw(ArgumentError("Unsupported dtype."))
+            throw(ArgumentError("Unsupported dtype $(dtype)."))
         end
     elseif devtype == :V100
         if Symbol(dtype) == :Float16
             max_peakflops *= 2 * 4 * 4 * 4
         else
-            throw(ArgumentError("Unsupported dtype."))
+            throw(ArgumentError("Unsupported dtype $(dtype)."))
         end
     end
     return max_peakflops
diff --git a/ext/CUDAExt/peakflops_gpu_wmmas.jl b/ext/CUDAExt/peakflops_gpu_wmmas.jl
index a12295b..3af1974 100644
--- a/ext/CUDAExt/peakflops_gpu_wmmas.jl
+++ b/ext/CUDAExt/peakflops_gpu_wmmas.jl
@@ -147,7 +147,7 @@ function _peakflops_gpu_wmmas(;
         dtype_a = dtype_b = BFloat16
         dtype_c = dtype_d = Float32
     else
-        throw(ArgumentError("Unsupported dtype."))
+        throw(ArgumentError("Unsupported dtype $(dtype)."))
     end
     d_a = CUDA.rand(dtype_a, m, k)
     d_b = CUDA.rand(dtype_b, k, n)
@@ -165,7 +165,7 @@
     elseif Symbol(dtype) in (:BFloat16, :BF16)
         kernel = @cuda launch = false _kernel_wmma_bf16_lowlevel(d_a, d_b, d_c, d_d)
     else
-        throw(ArgumentError("Unsupported dtype."))
+        throw(ArgumentError("Unsupported dtype $(dtype)."))
     end
     warpsize = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_WARP_SIZE)
     # @show threads
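
Reviewer note (not part of the patch): the per-dtype multipliers in
_theoretical_peakflops_gpu_tensorcores encode FLOPs per tensor core per clock
cycle: the factor 2 counts the multiply and the accumulate in A*B+C, and the
remaining factors are the matrix dimensions of one tensor-core operation
(e.g. 8x8x4 for Float16 on an A100). The Julia sketch below reproduces that
arithmetic standalone so the factors can be sanity-checked without a GPU. The
function name and the clock-rate/core-count values are illustrative
assumptions, not queried values; the patched code obtains the real numbers
via CUDA.attribute and ntensorcores(device). Only the A100/V100 Float16 cases
are shown, since the Hopper factors are still flagged "XXX: Wrong result!"
above.

# Standalone sketch (assumed inputs, no CUDA required) of the theoretical
# tensor-core peakflops arithmetic used in the patched function.
function sketch_tensorcore_peakflops(devtype::Symbol; max_clock_rate_khz, num_tensor_cores)
    # kHz * #cores * 1e-9 gives TFLOP/s per (FLOP per core per cycle)
    max_peakflops = max_clock_rate_khz * num_tensor_cores * 1e-9
    if devtype == :A100
        # 2 FLOPs (multiply + add in A*B+C) times the 8x8x4 matrix dimensions
        max_peakflops *= 2 * 8 * 8 * 4
    elseif devtype == :V100
        # V100 tensor cores operate on 4x4x4 matrices
        max_peakflops *= 2 * 4 * 4 * 4
    else
        throw(ArgumentError("Unsupported devtype $(devtype)."))
    end
    return max_peakflops
end

# With A100-like numbers (assumed: 1410 MHz boost clock, 432 tensor cores):
# 1_410_000 * 432 * 1e-9 * (2 * 8 * 8 * 4) ≈ 311.9 TFLOP/s, close to the
# ~312 TFLOP/s Float16 tensor-core peak commonly quoted for the A100.
sketch_tensorcore_peakflops(:A100; max_clock_rate_khz=1_410_000, num_tensor_cores=432)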