Skip to content

Commit

Permalink
Revamp benchmarks
Browse files Browse the repository at this point in the history
  • Loading branch information
charleskawczynski committed Oct 3, 2024
1 parent 4e08f90 commit 8ebc0c4
Show file tree
Hide file tree
Showing 8 changed files with 597 additions and 150 deletions.
5 changes: 3 additions & 2 deletions flame/flame.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@ include(joinpath(pkgdir(MBF), "test", "execution", "utils.jl"))

# ===========================================

has_cuda = CUDA.has_cuda()
AType = has_cuda ? CUDA.CuArray : Array
@static get(ENV, "USE_CUDA", nothing) == "true" && using CUDA
use_cuda = @isdefined(CUDA) && CUDA.has_cuda() # will be true if you first run `using CUDA`
AType = use_cuda ? CUDA.CuArray : Array
# arr_size = (prod((50,5,5,6,50)),)
arr_size = (50, 5, 5, 6, 50)
X = get_arrays(:x, arr_size, AType)
Expand Down
160 changes: 118 additions & 42 deletions test/execution/bm_fused_reads_vs_hard_coded.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
#=
using Revise; include(joinpath("test", "execution", "bm_fused_reads_vs_hard_coded.jl"))
=#
include("utils.jl")
include("utils_test.jl")
include("utils_setup.jl")
include("utils_benchmark.jl")

import MultiBroadcastFusion as MBF

# =========================================== hard-coded implementations
# Entry point for the hard-coded benchmark kernel: dispatch to the CPU or GPU
# method based on the device backing the first field of `X`.
function perf_kernel_hard_coded!(X, Y)
    return perf_kernel_hard_coded!(X, Y, MBF.device(X.x1))
end
Expand All @@ -19,45 +23,43 @@ function perf_kernel_hard_coded!(X, Y, ::MBF.CPU)
y7[i] = x7[i] + x8[i] + x9[i] + x10[i]
end
end
# GPU method of the hard-coded benchmark kernel: launches one CUDA thread per
# element of the underlying storage of `X.x1`, in blocks of up to 256 threads.
# NOTE(review): assumes all fields of `X`/`Y` have the same length as `X.x1` —
# the kernel indexes every field with the same bound. Confirm against `get_arrays`.
function perf_kernel_hard_coded!(X, Y, ::MBF.GPU)
    x1 = X.x1
    # Total number of elements; `parent` unwraps any array wrapper around `x1`.
    nitems = length(parent(x1))
    max_threads = 256 # can be higher if conditions permit
    nthreads = min(max_threads, nitems)
    # Enough blocks to cover all `nitems` elements (ceiling division).
    nblocks = cld(nitems, nthreads)
    CUDA.@cuda threads = (nthreads) blocks = (nblocks) knl_multi_copyto_hard_coded!(
        X,
        Y,
        Val(nitems),
    )
end
function knl_multi_copyto_hard_coded!(X, Y, ::Val{nitems}) where {nitems}
(; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X
(; y1, y2, y3, y4, y5, y6, y7, y8, y9, y10) = Y
idx = CUDA.threadIdx().x + (CUDA.blockIdx().x - 1) * CUDA.blockDim().x
@inbounds begin
if idx ≤ nitems
y1[idx] = x1[idx] + x2[idx] + x3[idx] + x4[idx]
y2[idx] = x2[idx] + x3[idx] + x4[idx] + x5[idx]
y3[idx] = x3[idx] + x4[idx] + x5[idx] + x6[idx]
y4[idx] = x4[idx] + x5[idx] + x6[idx] + x7[idx]
y5[idx] = x5[idx] + x6[idx] + x7[idx] + x8[idx]
y6[idx] = x6[idx] + x7[idx] + x8[idx] + x9[idx]
y7[idx] = x7[idx] + x8[idx] + x9[idx] + x10[idx]

@static get(ENV, "USE_CUDA", nothing) == "true" && using CUDA
use_cuda = @isdefined(CUDA) && CUDA.has_cuda() # will be true if you first run `using CUDA`
@static if use_cuda
# GPU method of the hard-coded benchmark kernel: configure a 1D launch with one
# thread per element and hand off to `knl_multi_copyto_hard_coded!`.
function perf_kernel_hard_coded!(X, Y, ::MBF.GPU)
    n_items = length(parent(X.x1))
    # Cap at 256 threads per block; can be higher if conditions permit.
    n_threads = min(256, n_items)
    n_blocks = cld(n_items, n_threads)
    CUDA.@cuda threads = n_threads blocks = n_blocks knl_multi_copyto_hard_coded!(
        X,
        Y,
        Val(n_items),
    )
end
"""
    knl_multi_copyto_hard_coded!(X, Y, ::Val{nitems})

Hand-written fused CUDA kernel: for each linear index `idx ≤ nitems`, writes
`yk[idx] = xk[idx] + x(k+1)[idx] + x(k+2)[idx] + x(k+3)[idx]` for `k = 1..7`,
reading fields `x1..x10` of `X` and writing fields `y1..y7` of `Y`.
`nitems` is lifted into the type domain via `Val` so the bound is a
compile-time constant inside the kernel.
"""
function knl_multi_copyto_hard_coded!(X, Y, ::Val{nitems}) where {nitems}
    (; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X
    (; y1, y2, y3, y4, y5, y6, y7, y8, y9, y10) = Y
    # 1-based global linear thread index.
    idx = CUDA.threadIdx().x + (CUDA.blockIdx().x - 1) * CUDA.blockDim().x
    @inbounds begin
        # Bounds guard: the launch grid may be padded past `nitems`.
        # (The `≤` operator was lost in the original text; restored here.)
        if idx ≤ nitems
            y1[idx] = x1[idx] + x2[idx] + x3[idx] + x4[idx]
            y2[idx] = x2[idx] + x3[idx] + x4[idx] + x5[idx]
            y3[idx] = x3[idx] + x4[idx] + x5[idx] + x6[idx]
            y4[idx] = x4[idx] + x5[idx] + x6[idx] + x7[idx]
            y5[idx] = x5[idx] + x6[idx] + x7[idx] + x8[idx]
            y6[idx] = x6[idx] + x7[idx] + x8[idx] + x9[idx]
            y7[idx] = x7[idx] + x8[idx] + x9[idx] + x10[idx]
        end
    end
    return nothing
end
return nothing
end

# ===========================================

has_cuda = CUDA.has_cuda()
AType = has_cuda ? CUDA.CuArray : Array
arr_size = (prod((50, 5, 5, 6, 50)),)
# arr_size = (50,5,5,6,50)
X = get_arrays(:x, arr_size, AType);
Y = get_arrays(:y, arr_size, AType);

function perf_kernel_unfused!(X, Y)
(; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X
(; y1, y2, y3, y4, y5, y6, y7, y8, y9, y10) = Y
Expand Down Expand Up @@ -87,27 +89,101 @@ function perf_kernel_fused!(X, Y)
end
end

@static get(ENV, "USE_CUDA", nothing) == "true" && using CUDA
use_cuda = @isdefined(CUDA) && CUDA.has_cuda() # will be true if you first run `using CUDA`
AType = use_cuda ? CUDA.CuArray : Array
device_name = use_cuda ? CUDA.name(CUDA.device()) : "CPU"
bm = Benchmark(; device_name, float_type = Float32)

problem_size = (50, 5, 5, 6, 50)

array_size = problem_size # array
X = get_arrays(:x, AType, bm.float_type, array_size)
Y = get_arrays(:y, AType, bm.float_type, array_size)
test_kernel!(;
fused! = perf_kernel_fused!,
unfused! = perf_kernel_unfused!,
X,
Y,
)
use_cuda && test_kernel!(;
fused! = perf_kernel_hard_coded!,
unfused! = perf_kernel_unfused!,
X,
Y,
)
push_benchmark!(
bm,
use_cuda,
perf_kernel_unfused!,
X,
Y;
n_reads_writes = 7 + 10,
problem_size = array_size,
)
push_benchmark!(
bm,
use_cuda,
perf_kernel_fused!,
X,
Y;
n_reads_writes = 7 + 10,
problem_size = array_size,
)
use_cuda && push_benchmark!(
bm,
use_cuda,
perf_kernel_hard_coded!,
X,
Y;
n_reads_writes = 7 + 10,
problem_size = array_size,
)

array_size = (prod(problem_size),) # vector
X = get_arrays(:x, AType, bm.float_type, array_size)
Y = get_arrays(:y, AType, bm.float_type, array_size)
test_kernel!(;
fused! = perf_kernel_fused!,
unfused! = perf_kernel_unfused!,
X,
Y,
)
use_cuda && test_kernel!(;
fused! = perf_kernel_hard_coded!,
unfused! = perf_kernel_unfused!,
X,
Y,
)
push_benchmark!(
bm,
use_cuda,
perf_kernel_unfused!,
X,
Y;
n_reads_writes = 7 + 10,
problem_size = array_size,
)
push_benchmark!(
bm,
use_cuda,
perf_kernel_fused!,
X,
Y;
n_reads_writes = 7 + 10,
problem_size = array_size,
)
use_cuda && push_benchmark!(
bm,
use_cuda,
perf_kernel_hard_coded!,
X,
Y;
n_reads_writes = 7 + 10,
problem_size = array_size,
)

# Compile
perf_kernel_unfused!(X, Y)
perf_kernel_fused!(X, Y)
perf_kernel_hard_coded!(X, Y)

# Benchmark
benchmark_kernel!(perf_kernel_unfused!, X, Y)
benchmark_kernel!(perf_kernel_fused!, X, Y)
benchmark_kernel!(perf_kernel_hard_coded!, X, Y)
tabulate_benchmark(bm)

nothing
73 changes: 61 additions & 12 deletions test/execution/bm_fused_shared_reads.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@
using Revise; include(joinpath("test", "execution", "bm_fused_shared_reads.jl"))
=#

include("utils.jl")
include("utils_test.jl")
include("utils_setup.jl")
include("utils_benchmark.jl")

import MultiBroadcastFusion as MBF

function perf_kernel_shared_reads_unfused!(X, Y)
(; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X
Expand Down Expand Up @@ -30,24 +34,69 @@ function perf_kernel_shared_reads_fused!(X, Y)
end
end

has_cuda = CUDA.has_cuda()
AType = has_cuda ? CUDA.CuArray : Array
arr_size = (prod((50, 5, 5, 6, 50)),)
X = get_arrays(:x, arr_size, AType)
Y = get_arrays(:y, arr_size, AType)
@static get(ENV, "USE_CUDA", nothing) == "true" && using CUDA
use_cuda = @isdefined(CUDA) && CUDA.has_cuda() # will be true if you first run `using CUDA`
AType = use_cuda ? CUDA.CuArray : Array
device_name = use_cuda ? CUDA.name(CUDA.device()) : "CPU"
bm = Benchmark(; device_name, float_type = Float32)
problem_size = (50, 5, 5, 6, 50)

array_size = problem_size # array
X = get_arrays(:x, AType, bm.float_type, array_size)
Y = get_arrays(:y, AType, bm.float_type, array_size)
test_kernel!(;
fused! = perf_kernel_shared_reads_fused!,
unfused! = perf_kernel_shared_reads_unfused!,
X,
Y,
)
push_benchmark!(
bm,
use_cuda,
perf_kernel_shared_reads_unfused!,
X,
Y;
n_reads_writes = 7 + 10,
problem_size = array_size,
)
push_benchmark!(
bm,
use_cuda,
perf_kernel_shared_reads_fused!,
X,
Y;
n_reads_writes = 7 + 10,
problem_size = array_size,
)

array_size = (prod(problem_size),) # vector
X = get_arrays(:x, AType, bm.float_type, array_size)
Y = get_arrays(:y, AType, bm.float_type, array_size)
test_kernel!(;
fused! = perf_kernel_shared_reads_fused!,
unfused! = perf_kernel_shared_reads_unfused!,
X,
Y,
)
# Compile
perf_kernel_shared_reads_unfused!(X, Y)
perf_kernel_shared_reads_fused!(X, Y)
push_benchmark!(
bm,
use_cuda,
perf_kernel_shared_reads_unfused!,
X,
Y;
n_reads_writes = 7 + 10,
problem_size = array_size,
)
push_benchmark!(
bm,
use_cuda,
perf_kernel_shared_reads_fused!,
X,
Y;
n_reads_writes = 7 + 10,
problem_size = array_size,
)

# Benchmark
benchmark_kernel!(perf_kernel_shared_reads_unfused!, X, Y)
benchmark_kernel!(perf_kernel_shared_reads_fused!, X, Y)
tabulate_benchmark(bm)

nothing
Loading

0 comments on commit 8ebc0c4

Please sign in to comment.