From de471a312369271f9ba993bb0cf14cb5915292f3 Mon Sep 17 00:00:00 2001 From: Charles Kawczynski Date: Thu, 3 Oct 2024 14:50:30 -0400 Subject: [PATCH] Update benchmarks --- flame/flame.jl | 13 ++-- .../execution/bm_fused_reads_vs_hard_coded.jl | 78 ++++++++----------- test/execution/bm_fused_shared_reads.jl | 36 ++++----- .../execution/bm_fused_shared_reads_writes.jl | 57 ++++++-------- test/execution/utils_setup.jl | 2 +- test/execution/utils_test.jl | 25 +----- 6 files changed, 79 insertions(+), 132 deletions(-) diff --git a/flame/flame.jl b/flame/flame.jl index e745586..d6abca3 100644 --- a/flame/flame.jl +++ b/flame/flame.jl @@ -16,16 +16,13 @@ X = get_arrays(:x, arr_size, AType) Y = get_arrays(:y, arr_size, AType) function perf_kernel_fused!(X, Y) - (; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X - (; y1, y2, y3, y4, y5, y6, y7, y8, y9, y10) = Y + (; x1, x2, x3, x4) = X + (; y1, y2, y3, y4) = Y @fused_direct begin @. y1 = x1 + x2 + x3 + x4 - @. y2 = x2 + x3 + x4 + x5 - @. y3 = x3 + x4 + x5 + x6 - @. y4 = x4 + x5 + x6 + x7 - @. y5 = x5 + x6 + x7 + x8 - @. y6 = x6 + x7 + x8 + x9 - @. y7 = x7 + x8 + x9 + x10 + @. y2 = x1 * x2 * x3 * x4 + @. y3 = x1 + x2 - x3 + x4 + @. y4 = x1 * x2 + x3 * x4 end end diff --git a/test/execution/bm_fused_reads_vs_hard_coded.jl b/test/execution/bm_fused_reads_vs_hard_coded.jl index ecdb219..4141567 100644 --- a/test/execution/bm_fused_reads_vs_hard_coded.jl +++ b/test/execution/bm_fused_reads_vs_hard_coded.jl @@ -11,16 +11,13 @@ import MultiBroadcastFusion as MBF perf_kernel_hard_coded!(X, Y) = perf_kernel_hard_coded!(X, Y, MBF.device(X.x1)) function perf_kernel_hard_coded!(X, Y, ::MBF.CPU) - (; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X - (; y1, y2, y3, y4, y5, y6, y7, y8, y9, y10) = Y + (; x1, x2, x3, x4) = X + (; y1, y2, y3, y4) = Y @inbounds for i in eachindex(x1) y1[i] = x1[i] + x2[i] + x3[i] + x4[i] - y2[i] = x2[i] + x3[i] + x4[i] + x5[i] - y3[i] = x3[i] + x4[i] + x5[i] + x6[i] - y4[i] = x4[i] + x5[i] + x6[i] + x7[i] - y5[i] = x5[i] + x6[i] + x7[i] + x8[i] - y6[i] = x6[i] + x7[i] + x8[i] + x9[i] - y7[i] = x7[i] + x8[i] + x9[i] + x10[i] + y2[i] = x1[i] * x2[i] * x3[i] * x4[i] + y3[i] = x1[i] + x2[i] - x3[i] + x4[i] + y4[i] = x1[i] * x2[i] + x3[i] * x4[i] end end @@ -40,18 +37,17 @@ use_cuda = @isdefined(CUDA) && CUDA.has_cuda() # will be true if you first run ` ) end function knl_multi_copyto_hard_coded!(X, Y, ::Val{nitems}) where {nitems} - (; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X - (; y1, y2, y3, y4, y5, y6, y7, y8, y9, y10) = Y - idx = CUDA.threadIdx().x + (CUDA.blockIdx().x - 1) * CUDA.blockDim().x + (; x1, x2, x3, x4) = X + (; y1, y2, y3, y4) = Y + i = + CUDA.threadIdx().x + + (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x @inbounds begin - if idx ≤ nitems - y1[idx] = x1[idx] + x2[idx] + x3[idx] + x4[idx] - y2[idx] = x2[idx] + x3[idx] + x4[idx] + x5[idx] - y3[idx] = x3[idx] + x4[idx] + x5[idx] + x6[idx] - y4[idx] = x4[idx] + x5[idx] + x6[idx] + x7[idx] - y5[idx] = x5[idx] + x6[idx] + x7[idx] + x8[idx] - y6[idx] = x6[idx] + x7[idx] + x8[idx] + x9[idx] - y7[idx] = x7[idx] + x8[idx] + x9[idx] + x10[idx] + if i ≤ nitems + y1[i] = x1[i] + x2[i] + x3[i] + x4[i] + y2[i] = x1[i] * x2[i] * x3[i] * x4[i] + y3[i] = x1[i] + x2[i] - x3[i] + x4[i] + y4[i] = x1[i] * x2[i] + x3[i] * x4[i] end end return nothing @@ -61,31 +57,25 @@ end # =========================================== function perf_kernel_unfused!(X, Y) - (; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X - (; y1, y2, y3, y4, y5, y6, y7, y8, y9, y10) = Y - # 7 writes; 10 unique reads - # 7 
writes; 28 reads including redundant ones + (; x1, x2, x3, x4) = X + (; y1, y2, y3, y4) = Y + # 4 writes; 4 unique reads + # 4 writes; 16 reads including redundant ones @. y1 = x1 + x2 + x3 + x4 - @. y2 = x2 + x3 + x4 + x5 - @. y3 = x3 + x4 + x5 + x6 - @. y4 = x4 + x5 + x6 + x7 - @. y5 = x5 + x6 + x7 + x8 - @. y6 = x6 + x7 + x8 + x9 - @. y7 = x7 + x8 + x9 + x10 + @. y2 = x1 * x2 * x3 * x4 + @. y3 = x1 + x2 - x3 + x4 + @. y4 = x1 * x2 + x3 * x4 return nothing end function perf_kernel_fused!(X, Y) - (; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X - (; y1, y2, y3, y4, y5, y6, y7, y8, y9, y10) = Y + (; x1, x2, x3, x4) = X + (; y1, y2, y3, y4) = Y MBF.@fused_direct begin @. y1 = x1 + x2 + x3 + x4 - @. y2 = x2 + x3 + x4 + x5 - @. y3 = x3 + x4 + x5 + x6 - @. y4 = x4 + x5 + x6 + x7 - @. y5 = x5 + x6 + x7 + x8 - @. y6 = x6 + x7 + x8 + x9 - @. y7 = x7 + x8 + x9 + x10 + @. y2 = x1 * x2 * x3 * x4 + @. y3 = x1 + x2 - x3 + x4 + @. y4 = x1 * x2 + x3 * x4 end end @@ -95,7 +85,7 @@ AType = use_cuda ? CUDA.CuArray : Array device_name = use_cuda ? CUDA.name(CUDA.device()) : "CPU" bm = Benchmark(; device_name, float_type = Float32) -problem_size = (50, 5, 5, 6, 50) +problem_size = (50, 5, 5, 6, 5400) array_size = problem_size # array X = get_arrays(:x, AType, bm.float_type, array_size) @@ -118,7 +108,7 @@ push_benchmark!( perf_kernel_unfused!, X, Y; - n_reads_writes = 7 + 10, + n_reads_writes = 4 + 4, problem_size = array_size, ) push_benchmark!( @@ -127,7 +117,7 @@ push_benchmark!( perf_kernel_fused!, X, Y; - n_reads_writes = 7 + 10, + n_reads_writes = 4 + 4, problem_size = array_size, ) use_cuda && push_benchmark!( @@ -136,7 +126,7 @@ use_cuda && push_benchmark!( perf_kernel_hard_coded!, X, Y; - n_reads_writes = 7 + 10, + n_reads_writes = 4 + 4, problem_size = array_size, ) @@ -161,7 +151,7 @@ push_benchmark!( perf_kernel_unfused!, X, Y; - n_reads_writes = 7 + 10, + n_reads_writes = 4 + 4, problem_size = array_size, ) push_benchmark!( @@ -170,7 +160,7 @@ push_benchmark!( perf_kernel_fused!, X, Y; - n_reads_writes = 7 + 10, + n_reads_writes = 4 + 4, problem_size = array_size, ) use_cuda && push_benchmark!( @@ -179,7 +169,7 @@ use_cuda && push_benchmark!( perf_kernel_hard_coded!, X, Y; - n_reads_writes = 7 + 10, + n_reads_writes = 4 + 4, problem_size = array_size, ) diff --git a/test/execution/bm_fused_shared_reads.jl b/test/execution/bm_fused_shared_reads.jl index 8853821..b23f6fb 100644 --- a/test/execution/bm_fused_shared_reads.jl +++ b/test/execution/bm_fused_shared_reads.jl @@ -9,28 +9,22 @@ include("utils_benchmark.jl") import MultiBroadcastFusion as MBF function perf_kernel_shared_reads_unfused!(X, Y) - (; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X - (; y1, y2, y3, y4, y5, y6, y7, y8, y9, y10) = Y + (; x1, x2, x3, x4) = X + (; y1, y2, y3, y4) = Y @. y1 = x1 + x2 + x3 + x4 - @. y2 = x2 + x3 + x4 + x5 - @. y3 = x3 + x4 + x5 + x6 - @. y4 = x4 + x5 + x6 + x7 - @. y5 = x5 + x6 + x7 + x8 - @. y6 = x6 + x7 + x8 + x9 - @. y7 = x7 + x8 + x9 + x10 + @. y2 = x1 * x2 * x3 * x4 + @. y3 = x1 + x2 - x3 + x4 + @. y4 = x1 * x2 + x3 * x4 end function perf_kernel_shared_reads_fused!(X, Y) - (; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X - (; y1, y2, y3, y4, y5, y6, y7, y8, y9, y10) = Y + (; x1, x2, x3, x4) = X + (; y1, y2, y3, y4) = Y MBF.@fused_direct begin @. y1 = x1 + x2 + x3 + x4 - @. y2 = x2 + x3 + x4 + x5 - @. y3 = x3 + x4 + x5 + x6 - @. y4 = x4 + x5 + x6 + x7 - @. y5 = x5 + x6 + x7 + x8 - @. y6 = x6 + x7 + x8 + x9 - @. y7 = x7 + x8 + x9 + x10 + @. y2 = x1 * x2 * x3 * x4 + @. y3 = x1 + x2 - x3 + x4 + @. 
y4 = x1 * x2 + x3 * x4 end end @@ -39,7 +33,7 @@ use_cuda = @isdefined(CUDA) && CUDA.has_cuda() # will be true if you first run ` AType = use_cuda ? CUDA.CuArray : Array device_name = use_cuda ? CUDA.name(CUDA.device()) : "CPU" bm = Benchmark(; device_name, float_type = Float32) -problem_size = (50, 5, 5, 6, 50) +problem_size = (50, 5, 5, 6, 5400) array_size = problem_size # array X = get_arrays(:x, AType, bm.float_type, array_size) @@ -56,7 +50,7 @@ push_benchmark!( perf_kernel_shared_reads_unfused!, X, Y; - n_reads_writes = 7 + 10, + n_reads_writes = 4 + 4, problem_size = array_size, ) push_benchmark!( @@ -65,7 +59,7 @@ push_benchmark!( perf_kernel_shared_reads_fused!, X, Y; - n_reads_writes = 7 + 10, + n_reads_writes = 4 + 4, problem_size = array_size, ) @@ -84,7 +78,7 @@ push_benchmark!( perf_kernel_shared_reads_unfused!, X, Y; - n_reads_writes = 7 + 10, + n_reads_writes = 4 + 4, problem_size = array_size, ) push_benchmark!( @@ -93,7 +87,7 @@ push_benchmark!( perf_kernel_shared_reads_fused!, X, Y; - n_reads_writes = 7 + 10, + n_reads_writes = 4 + 4, problem_size = array_size, ) diff --git a/test/execution/bm_fused_shared_reads_writes.jl b/test/execution/bm_fused_shared_reads_writes.jl index 2df36ed..c3f8a61 100644 --- a/test/execution/bm_fused_shared_reads_writes.jl +++ b/test/execution/bm_fused_shared_reads_writes.jl @@ -9,35 +9,26 @@ include("utils_benchmark.jl") import MultiBroadcastFusion as MBF function perf_kernel_shared_reads_writes_unfused!(X, Y) - (; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X - (; y1, y2, y3, y4, y5, y6, y7, y8, y9, y10) = Y - # Totoal: 10 writes, 15 reads, and 5 read/write overlaps - @. y1 = x1 + x6 - @. y2 = x2 + x7 - @. y3 = x3 + x8 - @. y4 = x4 + x9 - @. y5 = x5 + x10 - @. y6 = y1 - @. y7 = y2 - @. y8 = y3 - @. y9 = y4 - @. y10 = y5 + (; x1, x2, x3, x4) = X + (; y1, y2, y3, y4) = Y + # Total: 4 writes, 8 reads (including redundants) + # Theoretical minimum: 4 + 4 read/writes + @. y1 = x1 + x3 + @. y2 = x2 + x4 + @. y3 = y1 + x4 + @. y4 = y2 + y3 end function perf_kernel_shared_reads_writes_fused!(X, Y) - (; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X - (; y1, y2, y3, y4, y5, y6, y7, y8, y9, y10) = Y + (; x1, x2, x3, x4) = X + (; y1, y2, y3, y4) = Y + # Total: 4 writes, 8 reads (including redundants) + # Theoretical minimum: 4 + 4 read/writes MBF.@fused_direct begin - @. y1 = x1 + x6 - @. y2 = x2 + x7 - @. y3 = x3 + x8 - @. y4 = x4 + x9 - @. y5 = x5 + x10 - @. y6 = y1 - @. y7 = y2 - @. y8 = y3 - @. y9 = y4 - @. y10 = y5 + @. y1 = x1 + x3 + @. y2 = x2 + x4 + @. y3 = y1 + x4 + @. y4 = y2 + y3 end end @@ -45,12 +36,8 @@ end use_cuda = @isdefined(CUDA) && CUDA.has_cuda() # will be true if you first run `using CUDA` AType = use_cuda ? CUDA.CuArray : Array device_name = use_cuda ? 
CUDA.name(CUDA.device()) : "CPU" -bm = Benchmark(; - problem_size = (prod((50, 5, 5, 6, 50)),), - device_name, - float_type = Float32, -) -problem_size = (50, 5, 5, 6, 50) +bm = Benchmark(; device_name, float_type = Float32) +problem_size = (50, 5, 5, 6, 5400) array_size = problem_size # array X = get_arrays(:x, AType, bm.float_type, array_size) @@ -68,7 +55,7 @@ push_benchmark!( perf_kernel_shared_reads_writes_unfused!, X, Y; - n_reads_writes = 10 + 15, + n_reads_writes = 4 + 4, problem_size = array_size, ) push_benchmark!( @@ -77,7 +64,7 @@ push_benchmark!( perf_kernel_shared_reads_writes_fused!, X, Y; - n_reads_writes = 10 + 15, + n_reads_writes = 4 + 4, problem_size = array_size, ) @@ -97,7 +84,7 @@ push_benchmark!( perf_kernel_shared_reads_writes_unfused!, X, Y; - n_reads_writes = 10 + 15, + n_reads_writes = 4 + 4, problem_size = array_size, ) push_benchmark!( @@ -106,7 +93,7 @@ push_benchmark!( perf_kernel_shared_reads_writes_fused!, X, Y; - n_reads_writes = 10 + 15, + n_reads_writes = 4 + 4, problem_size = array_size, ) diff --git a/test/execution/utils_setup.jl b/test/execution/utils_setup.jl index e2353da..ea63593 100644 --- a/test/execution/utils_setup.jl +++ b/test/execution/utils_setup.jl @@ -1,6 +1,6 @@ get_array(AType, FT, s) = AType(zeros(FT, s...)) -function get_arrays(sym, AType, FT, s, n = 10) +function get_arrays(sym, AType, FT, s, n = 4) println("array_type = $AType") fn = ntuple(i -> Symbol(sym, i), n) return (; zip(fn, ntuple(_ -> get_array(AType, FT, s), n))...) diff --git a/test/execution/utils_test.jl b/test/execution/utils_test.jl index 86d3d24..e9bc638 100644 --- a/test/execution/utils_test.jl +++ b/test/execution/utils_test.jl @@ -90,9 +90,9 @@ function test_kernel!(; fused!, unfused!, X, Y) y .= map(_ -> rand(), y) end X_fused = deepcopy(X) - X_unfused = deepcopy(X) + X_unfused = X Y_fused = deepcopy(Y) - Y_unfused = deepcopy(Y) + Y_unfused = Y fused!(X_fused, Y_fused) unfused!(X_unfused, Y_unfused) @testset "Test correctness of $(nameof(typeof(fused!)))" begin @@ -100,24 +100,3 @@ function test_kernel!(; fused!, unfused!, X, Y) test_compare(Y_fused, Y_unfused) end end -function test_kernel_args!(; fused!, unfused!, args) - (; X, Y) = args - for x in X - x .= rand(size(x)...) - end - for y in Y - y .= rand(size(y)...) - end - X_fused = deepcopy(X) - X_unfused = deepcopy(X) - Y_fused = deepcopy(Y) - Y_unfused = deepcopy(Y) - args_fused = (; X = X_fused, Y = Y_fused) - args_unfused = (; X = X_unfused, Y = Y_unfused) - fused!(args_fused) - unfused!(args_unfused) - @testset "Test correctness of $(nameof(typeof(fused!)))" begin - test_compare(X_fused, X_unfused) - test_compare(Y_fused, Y_unfused) - end -end
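
Note (not part of the patch): below is a minimal, self-contained sketch of the fused vs. unfused pattern that these updated benchmarks exercise, using plain CPU Arrays and the same four-output kernel introduced above. The names n, FT, unfused!, fused!, and Yref are illustrative only; MBF.@fused_direct and the destructuring style are taken directly from the patch. The n_reads_writes = 4 + 4 passed to push_benchmark! reflects the optimal memory traffic of the fused version (4 unique reads of x1..x4 plus 4 writes of y1..y4), whereas the unfused loop re-reads the inputs for every output (4 writes, 16 reads including redundant ones).

    import MultiBroadcastFusion as MBF

    n = 1_000
    FT = Float32
    X = (; x1 = rand(FT, n), x2 = rand(FT, n), x3 = rand(FT, n), x4 = rand(FT, n))
    Y = (; y1 = zeros(FT, n), y2 = zeros(FT, n), y3 = zeros(FT, n), y4 = zeros(FT, n))

    # Unfused: four independent broadcasts; x1..x4 are re-read for every
    # output (4 writes, 16 reads including redundant ones).
    function unfused!(X, Y)
        (; x1, x2, x3, x4) = X
        (; y1, y2, y3, y4) = Y
        @. y1 = x1 + x2 + x3 + x4
        @. y2 = x1 * x2 * x3 * x4
        @. y3 = x1 + x2 - x3 + x4
        @. y4 = x1 * x2 + x3 * x4
        return nothing
    end

    # Fused: all four assignments are evaluated in a single pass over the
    # data (4 writes, 4 unique reads).
    function fused!(X, Y)
        (; x1, x2, x3, x4) = X
        (; y1, y2, y3, y4) = Y
        MBF.@fused_direct begin
            @. y1 = x1 + x2 + x3 + x4
            @. y2 = x1 * x2 * x3 * x4
            @. y3 = x1 + x2 - x3 + x4
            @. y4 = x1 * x2 + x3 * x4
        end
        return nothing
    end

    # Both paths should produce the same results, element for element.
    unfused!(X, Y)
    Yref = deepcopy(Y)
    fused!(X, Y)
    @assert Yref.y1 ≈ Y.y1 && Yref.y2 ≈ Y.y2 && Yref.y3 ≈ Y.y3 && Yref.y4 ≈ Y.y4

Since the fused version traverses the arrays once, the achieved bandwidth reported by the benchmarks is compared against the 4 + 4 read/write minimum rather than the 4 + 16 traffic of the naive version.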