Skip to content

Commit

Permalink
Merge pull request #38 from CliMA/ck/update_bm
Browse files Browse the repository at this point in the history
Update benchmarks
  • Loading branch information
charleskawczynski authored Oct 4, 2024
2 parents f3064ba + de471a3 commit 86c3392
Show file tree
Hide file tree
Showing 6 changed files with 79 additions and 132 deletions.
13 changes: 5 additions & 8 deletions flame/flame.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,13 @@ X = get_arrays(:x, arr_size, AType)
Y = get_arrays(:y, arr_size, AType)

# Fused benchmark kernel: the four broadcast assignments are grouped under
# `@fused_direct` so MultiBroadcastFusion can execute them together
# (post-commit version — the stale lines referencing x5–x10/y5–y7, which are
# no longer produced by `get_arrays(n = 4)`, are removed).
function perf_kernel_fused!(X, Y)
    (; x1, x2, x3, x4) = X
    (; y1, y2, y3, y4) = Y
    @fused_direct begin
        @. y1 = x1 + x2 + x3 + x4
        @. y2 = x1 * x2 * x3 * x4
        @. y3 = x1 + x2 - x3 + x4
        @. y4 = x1 * x2 + x3 * x4
    end
end

Expand Down
78 changes: 34 additions & 44 deletions test/execution/bm_fused_reads_vs_hard_coded.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,13 @@ import MultiBroadcastFusion as MBF
perf_kernel_hard_coded!(X, Y) = perf_kernel_hard_coded!(X, Y, MBF.device(X.x1))

# Hand-written CPU reference kernel: computes all four outputs in one explicit
# loop over the elements, element-for-element identical to the broadcast
# kernels (post-commit version; stale x5–x10 lines removed).
function perf_kernel_hard_coded!(X, Y, ::MBF.CPU)
    (; x1, x2, x3, x4) = X
    (; y1, y2, y3, y4) = Y
    # `eachindex` makes `@inbounds` provably safe for these same-size arrays.
    @inbounds for i in eachindex(x1)
        y1[i] = x1[i] + x2[i] + x3[i] + x4[i]
        y2[i] = x1[i] * x2[i] * x3[i] * x4[i]
        y3[i] = x1[i] + x2[i] - x3[i] + x4[i]
        y4[i] = x1[i] * x2[i] + x3[i] * x4[i]
    end
end

Expand All @@ -40,18 +37,17 @@ use_cuda = @isdefined(CUDA) && CUDA.has_cuda() # will be true if you first run `
)
end
# Hand-written CUDA kernel equivalent of the fused broadcast: one thread per
# element, guarded by `i ≤ nitems` (the `≤` was lost in the page extraction).
# `nitems` is lifted to the type domain via `Val` so the bound is a compile-time
# constant inside the kernel.
function knl_multi_copyto_hard_coded!(X, Y, ::Val{nitems}) where {nitems}
    (; x1, x2, x3, x4) = X
    (; y1, y2, y3, y4) = Y
    # Global linear thread index; Int32(1) avoids promoting the index to Int64.
    i =
        CUDA.threadIdx().x +
        (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x
    @inbounds begin
        if i ≤ nitems
            y1[i] = x1[i] + x2[i] + x3[i] + x4[i]
            y2[i] = x1[i] * x2[i] * x3[i] * x4[i]
            y3[i] = x1[i] + x2[i] - x3[i] + x4[i]
            y4[i] = x1[i] * x2[i] + x3[i] * x4[i]
        end
    end
    return nothing
end
# ===========================================

# Unfused baseline: each broadcast assignment is its own pass over the arrays,
# so the shared reads of x1..x4 are repeated per statement.
# 4 writes; 4 unique reads
# 4 writes; 16 reads including redundant ones
function perf_kernel_unfused!(X, Y)
    (; x1, x2, x3, x4) = X
    (; y1, y2, y3, y4) = Y
    @. y1 = x1 + x2 + x3 + x4
    @. y2 = x1 * x2 * x3 * x4
    @. y3 = x1 + x2 - x3 + x4
    @. y4 = x1 * x2 + x3 * x4
    return nothing
end

# Fused variant of `perf_kernel_unfused!`: identical math, but the four
# broadcasts are grouped under `MBF.@fused_direct` so they can be executed
# in a single fused pass (post-commit version; stale x5–x10 lines removed).
function perf_kernel_fused!(X, Y)
    (; x1, x2, x3, x4) = X
    (; y1, y2, y3, y4) = Y
    MBF.@fused_direct begin
        @. y1 = x1 + x2 + x3 + x4
        @. y2 = x1 * x2 * x3 * x4
        @. y3 = x1 + x2 - x3 + x4
        @. y4 = x1 * x2 + x3 * x4
    end
end

Expand All @@ -95,7 +85,7 @@ AType = use_cuda ? CUDA.CuArray : Array
device_name = use_cuda ? CUDA.name(CUDA.device()) : "CPU"
bm = Benchmark(; device_name, float_type = Float32)

problem_size = (50, 5, 5, 6, 5400)

array_size = problem_size # array
X = get_arrays(:x, AType, bm.float_type, array_size)
Expand All @@ -118,7 +108,7 @@ push_benchmark!(
perf_kernel_unfused!,
X,
Y;
n_reads_writes = 4 + 4,
problem_size = array_size,
)
push_benchmark!(
Expand All @@ -127,7 +117,7 @@ push_benchmark!(
perf_kernel_fused!,
X,
Y;
n_reads_writes = 4 + 4,
problem_size = array_size,
)
use_cuda && push_benchmark!(
Expand All @@ -136,7 +126,7 @@ use_cuda && push_benchmark!(
perf_kernel_hard_coded!,
X,
Y;
n_reads_writes = 4 + 4,
problem_size = array_size,
)

Expand All @@ -161,7 +151,7 @@ push_benchmark!(
perf_kernel_unfused!,
X,
Y;
n_reads_writes = 4 + 4,
problem_size = array_size,
)
push_benchmark!(
Expand All @@ -170,7 +160,7 @@ push_benchmark!(
perf_kernel_fused!,
X,
Y;
n_reads_writes = 4 + 4,
problem_size = array_size,
)
use_cuda && push_benchmark!(
Expand All @@ -179,7 +169,7 @@ use_cuda && push_benchmark!(
perf_kernel_hard_coded!,
X,
Y;
n_reads_writes = 4 + 4,
problem_size = array_size,
)

Expand Down
36 changes: 15 additions & 21 deletions test/execution/bm_fused_shared_reads.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,28 +9,22 @@ include("utils_benchmark.jl")
import MultiBroadcastFusion as MBF

# Unfused shared-reads benchmark: every statement re-reads x1..x4, so fusing
# these broadcasts saves redundant reads (post-commit version; stale
# x5–x10/y5–y7 lines removed).
function perf_kernel_shared_reads_unfused!(X, Y)
    (; x1, x2, x3, x4) = X
    (; y1, y2, y3, y4) = Y
    @. y1 = x1 + x2 + x3 + x4
    @. y2 = x1 * x2 * x3 * x4
    @. y3 = x1 + x2 - x3 + x4
    @. y4 = x1 * x2 + x3 * x4
end

# Fused variant of `perf_kernel_shared_reads_unfused!`: same four broadcasts,
# grouped under `MBF.@fused_direct` so the shared reads of x1..x4 can be
# shared across statements (post-commit version; stale lines removed).
function perf_kernel_shared_reads_fused!(X, Y)
    (; x1, x2, x3, x4) = X
    (; y1, y2, y3, y4) = Y
    MBF.@fused_direct begin
        @. y1 = x1 + x2 + x3 + x4
        @. y2 = x1 * x2 * x3 * x4
        @. y3 = x1 + x2 - x3 + x4
        @. y4 = x1 * x2 + x3 * x4
    end
end

Expand All @@ -39,7 +33,7 @@ use_cuda = @isdefined(CUDA) && CUDA.has_cuda() # will be true if you first run `
AType = use_cuda ? CUDA.CuArray : Array
device_name = use_cuda ? CUDA.name(CUDA.device()) : "CPU"
bm = Benchmark(; device_name, float_type = Float32)
problem_size = (50, 5, 5, 6, 5400)

array_size = problem_size # array
X = get_arrays(:x, AType, bm.float_type, array_size)
Expand All @@ -56,7 +50,7 @@ push_benchmark!(
perf_kernel_shared_reads_unfused!,
X,
Y;
n_reads_writes = 4 + 4,
problem_size = array_size,
)
push_benchmark!(
Expand All @@ -65,7 +59,7 @@ push_benchmark!(
perf_kernel_shared_reads_fused!,
X,
Y;
n_reads_writes = 4 + 4,
problem_size = array_size,
)

Expand All @@ -84,7 +78,7 @@ push_benchmark!(
perf_kernel_shared_reads_unfused!,
X,
Y;
n_reads_writes = 4 + 4,
problem_size = array_size,
)
push_benchmark!(
Expand All @@ -93,7 +87,7 @@ push_benchmark!(
perf_kernel_shared_reads_fused!,
X,
Y;
n_reads_writes = 4 + 4,
problem_size = array_size,
)

Expand Down
57 changes: 22 additions & 35 deletions test/execution/bm_fused_shared_reads_writes.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,48 +9,35 @@ include("utils_benchmark.jl")
import MultiBroadcastFusion as MBF

# Unfused shared-reads-AND-writes benchmark: later statements read earlier
# outputs (y3 reads y1; y4 reads y2 and y3), exercising read-after-write
# dependencies between broadcasts (post-commit version; the stale 10-field
# lines and the "Totoal" typo comment are removed).
function perf_kernel_shared_reads_writes_unfused!(X, Y)
    (; x1, x2, x3, x4) = X
    (; y1, y2, y3, y4) = Y
    # Total: 4 writes, 8 reads (including redundants)
    # Theoretical minimum: 4 + 4 read/writes
    @. y1 = x1 + x3
    @. y2 = x2 + x4
    @. y3 = y1 + x4
    @. y4 = y2 + y3
end

# Fused variant of `perf_kernel_shared_reads_writes_unfused!`: identical math
# with read-after-write dependencies (y3 reads y1; y4 reads y2 and y3),
# grouped under `MBF.@fused_direct` (post-commit version; stale lines removed).
function perf_kernel_shared_reads_writes_fused!(X, Y)
    (; x1, x2, x3, x4) = X
    (; y1, y2, y3, y4) = Y
    # Total: 4 writes, 8 reads (including redundants)
    # Theoretical minimum: 4 + 4 read/writes
    MBF.@fused_direct begin
        @. y1 = x1 + x3
        @. y2 = x2 + x4
        @. y3 = y1 + x4
        @. y4 = y2 + y3
    end
end

@static get(ENV, "USE_CUDA", nothing) == "true" && using CUDA
use_cuda = @isdefined(CUDA) && CUDA.has_cuda() # will be true if you first run `using CUDA`
AType = use_cuda ? CUDA.CuArray : Array
device_name = use_cuda ? CUDA.name(CUDA.device()) : "CPU"
# Benchmark configuration (post-commit version: the commit replaced the
# flattened `problem_size` passed to `Benchmark` with a separate, larger
# `problem_size`; the stale pre-commit lines are removed).
bm = Benchmark(; device_name, float_type = Float32)
problem_size = (50, 5, 5, 6, 5400)

array_size = problem_size # array
X = get_arrays(:x, AType, bm.float_type, array_size)
Expand All @@ -68,7 +55,7 @@ push_benchmark!(
perf_kernel_shared_reads_writes_unfused!,
X,
Y;
n_reads_writes = 4 + 4,
problem_size = array_size,
)
push_benchmark!(
Expand All @@ -77,7 +64,7 @@ push_benchmark!(
perf_kernel_shared_reads_writes_fused!,
X,
Y;
n_reads_writes = 4 + 4,
problem_size = array_size,
)

Expand All @@ -97,7 +84,7 @@ push_benchmark!(
perf_kernel_shared_reads_writes_unfused!,
X,
Y;
n_reads_writes = 4 + 4,
problem_size = array_size,
)
push_benchmark!(
Expand All @@ -106,7 +93,7 @@ push_benchmark!(
perf_kernel_shared_reads_writes_fused!,
X,
Y;
n_reads_writes = 4 + 4,
problem_size = array_size,
)

Expand Down
2 changes: 1 addition & 1 deletion test/execution/utils_setup.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
get_array(AType, FT, s) = AType(zeros(FT, s...))

function get_arrays(sym, AType, FT, s, n = 4)
println("array_type = $AType")
fn = ntuple(i -> Symbol(sym, i), n)
return (; zip(fn, ntuple(_ -> get_array(AType, FT, s), n))...)
Expand Down
Loading

0 comments on commit 86c3392

Please sign in to comment.