Skip to content

Commit

Permalink
Revamp benchmarks
Browse files Browse the repository at this point in the history
  • Loading branch information
charleskawczynski committed Oct 3, 2024
1 parent 4e08f90 commit 8ebc0c4
Show file tree
Hide file tree
Showing 8 changed files with 597 additions and 150 deletions.
5 changes: 3 additions & 2 deletions flame/flame.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@ include(joinpath(pkgdir(MBF), "test", "execution", "utils.jl"))

# ===========================================

has_cuda = CUDA.has_cuda()
AType = has_cuda ? CUDA.CuArray : Array
@static get(ENV, "USE_CUDA", nothing) == "true" && using CUDA
use_cuda = @isdefined(CUDA) && CUDA.has_cuda() # will be true if you first run `using CUDA`
AType = use_cuda ? CUDA.CuArray : Array
# arr_size = (prod((50,5,5,6,50)),)
arr_size = (50, 5, 5, 6, 50)
X = get_arrays(:x, arr_size, AType)
Expand Down
160 changes: 118 additions & 42 deletions test/execution/bm_fused_reads_vs_hard_coded.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
#=
using Revise; include(joinpath("test", "execution", "bm_fused_reads_vs_hard_coded.jl"))
=#
include("utils.jl")
include("utils_test.jl")
include("utils_setup.jl")
include("utils_benchmark.jl")

import MultiBroadcastFusion as MBF

# =========================================== hard-coded implementations
# Entry point for the hard-coded benchmark kernel: dispatch to the CPU or GPU
# method based on the device backing the first field of `X`.
function perf_kernel_hard_coded!(X, Y)
    return perf_kernel_hard_coded!(X, Y, MBF.device(X.x1))
end
Expand All @@ -19,45 +23,43 @@ function perf_kernel_hard_coded!(X, Y, ::MBF.CPU)
y7[i] = x7[i] + x8[i] + x9[i] + x10[i]
end
end
# GPU method of the hard-coded benchmark kernel: launches one CUDA thread per
# element of the underlying storage of `X.x1`, in blocks of up to 256 threads.
# NOTE(review): assumes all fields of `X`/`Y` have the same length as `X.x1` —
# the kernel indexes every field with the same bound. Confirm against `get_arrays`.
function perf_kernel_hard_coded!(X, Y, ::MBF.GPU)
    x1 = X.x1
    # Total number of elements; `parent` unwraps any array wrapper around `x1`.
    nitems = length(parent(x1))
    max_threads = 256 # can be higher if conditions permit
    nthreads = min(max_threads, nitems)
    # Enough blocks to cover all `nitems` elements (ceiling division).
    nblocks = cld(nitems, nthreads)
    CUDA.@cuda threads = (nthreads) blocks = (nblocks) knl_multi_copyto_hard_coded!(
        X,
        Y,
        Val(nitems),
    )
end
function knl_multi_copyto_hard_coded!(X, Y, ::Val{nitems}) where {nitems}
(; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X
(; y1, y2, y3, y4, y5, y6, y7, y8, y9, y10) = Y
idx = CUDA.threadIdx().x + (CUDA.blockIdx().x - 1) * CUDA.blockDim().x
@inbounds begin
if idx ≤ nitems
y1[idx] = x1[idx] + x2[idx] + x3[idx] + x4[idx]
y2[idx] = x2[idx] + x3[idx] + x4[idx] + x5[idx]
y3[idx] = x3[idx] + x4[idx] + x5[idx] + x6[idx]
y4[idx] = x4[idx] + x5[idx] + x6[idx] + x7[idx]
y5[idx] = x5[idx] + x6[idx] + x7[idx] + x8[idx]
y6[idx] = x6[idx] + x7[idx] + x8[idx] + x9[idx]
y7[idx] = x7[idx] + x8[idx] + x9[idx] + x10[idx]

@static get(ENV, "USE_CUDA", nothing) == "true" && using CUDA
use_cuda = @isdefined(CUDA) && CUDA.has_cuda() # will be true if you first run `using CUDA`
@static if use_cuda
# GPU method of the hard-coded benchmark kernel: configure a 1D launch with one
# thread per element and hand off to `knl_multi_copyto_hard_coded!`.
function perf_kernel_hard_coded!(X, Y, ::MBF.GPU)
    n_items = length(parent(X.x1))
    # Cap at 256 threads per block; can be higher if conditions permit.
    n_threads = min(256, n_items)
    n_blocks = cld(n_items, n_threads)
    CUDA.@cuda threads = n_threads blocks = n_blocks knl_multi_copyto_hard_coded!(
        X,
        Y,
        Val(n_items),
    )
end
"""
    knl_multi_copyto_hard_coded!(X, Y, ::Val{nitems})

Hand-written fused CUDA kernel: for each linear index `idx ≤ nitems`, writes
`yk[idx] = xk[idx] + x(k+1)[idx] + x(k+2)[idx] + x(k+3)[idx]` for `k = 1..7`,
reading fields `x1..x10` of `X` and writing fields `y1..y7` of `Y`.
`nitems` is lifted into the type domain via `Val` so the bound is a
compile-time constant inside the kernel.
"""
function knl_multi_copyto_hard_coded!(X, Y, ::Val{nitems}) where {nitems}
    (; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X
    (; y1, y2, y3, y4, y5, y6, y7, y8, y9, y10) = Y
    # 1-based global linear thread index.
    idx = CUDA.threadIdx().x + (CUDA.blockIdx().x - 1) * CUDA.blockDim().x
    @inbounds begin
        # Bounds guard: the launch grid may be padded past `nitems`.
        # (The `≤` operator was lost in the original text; restored here.)
        if idx ≤ nitems
            y1[idx] = x1[idx] + x2[idx] + x3[idx] + x4[idx]
            y2[idx] = x2[idx] + x3[idx] + x4[idx] + x5[idx]
            y3[idx] = x3[idx] + x4[idx] + x5[idx] + x6[idx]
            y4[idx] = x4[idx] + x5[idx] + x6[idx] + x7[idx]
            y5[idx] = x5[idx] + x6[idx] + x7[idx] + x8[idx]
            y6[idx] = x6[idx] + x7[idx] + x8[idx] + x9[idx]
            y7[idx] = x7[idx] + x8[idx] + x9[idx] + x10[idx]
        end
    end
    return nothing
end
return nothing
end

# ===========================================

has_cuda = CUDA.has_cuda()
AType = has_cuda ? CUDA.CuArray : Array
arr_size = (prod((50, 5, 5, 6, 50)),)
# arr_size = (50,5,5,6,50)
X = get_arrays(:x, arr_size, AType);
Y = get_arrays(:y, arr_size, AType);

function perf_kernel_unfused!(X, Y)
(; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X
(; y1, y2, y3, y4, y5, y6, y7, y8, y9, y10) = Y
Expand Down Expand Up @@ -87,27 +89,101 @@ function perf_kernel_fused!(X, Y)
end
end

@static get(ENV, "USE_CUDA", nothing) == "true" && using CUDA
use_cuda = @isdefined(CUDA) && CUDA.has_cuda() # will be true if you first run `using CUDA`
AType = use_cuda ? CUDA.CuArray : Array
device_name = use_cuda ? CUDA.name(CUDA.device()) : "CPU"
bm = Benchmark(; device_name, float_type = Float32)

problem_size = (50, 5, 5, 6, 50)

array_size = problem_size # array
X = get_arrays(:x, AType, bm.float_type, array_size)
Y = get_arrays(:y, AType, bm.float_type, array_size)
test_kernel!(;
fused! = perf_kernel_fused!,
unfused! = perf_kernel_unfused!,
X,
Y,
)
use_cuda && test_kernel!(;
fused! = perf_kernel_hard_coded!,
unfused! = perf_kernel_unfused!,
X,
Y,
)
push_benchmark!(
bm,
use_cuda,
perf_kernel_unfused!,
X,
Y;
n_reads_writes = 7 + 10,
problem_size = array_size,
)
push_benchmark!(
bm,
use_cuda,
perf_kernel_fused!,
X,
Y;
n_reads_writes = 7 + 10,
problem_size = array_size,
)
use_cuda && push_benchmark!(
bm,
use_cuda,
perf_kernel_hard_coded!,
X,
Y;
n_reads_writes = 7 + 10,
problem_size = array_size,
)

array_size = (prod(problem_size),) # vector
X = get_arrays(:x, AType, bm.float_type, array_size)
Y = get_arrays(:y, AType, bm.float_type, array_size)
test_kernel!(;
fused! = perf_kernel_fused!,
unfused! = perf_kernel_unfused!,
X,
Y,
)
use_cuda && test_kernel!(;
fused! = perf_kernel_hard_coded!,
unfused! = perf_kernel_unfused!,
X,
Y,
)
push_benchmark!(
bm,
use_cuda,
perf_kernel_unfused!,
X,
Y;
n_reads_writes = 7 + 10,
problem_size = array_size,
)
push_benchmark!(
bm,
use_cuda,
perf_kernel_fused!,
X,
Y;
n_reads_writes = 7 + 10,
problem_size = array_size,
)
use_cuda && push_benchmark!(
bm,
use_cuda,
perf_kernel_hard_coded!,
X,
Y;
n_reads_writes = 7 + 10,
problem_size = array_size,
)

# Compile
perf_kernel_unfused!(X, Y)
perf_kernel_fused!(X, Y)
perf_kernel_hard_coded!(X, Y)

# Benchmark
benchmark_kernel!(perf_kernel_unfused!, X, Y)
benchmark_kernel!(perf_kernel_fused!, X, Y)
benchmark_kernel!(perf_kernel_hard_coded!, X, Y)
tabulate_benchmark(bm)

nothing
73 changes: 61 additions & 12 deletions test/execution/bm_fused_shared_reads.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@
using Revise; include(joinpath("test", "execution", "bm_fused_shared_reads.jl"))
=#

include("utils.jl")
include("utils_test.jl")
include("utils_setup.jl")
include("utils_benchmark.jl")

import MultiBroadcastFusion as MBF

function perf_kernel_shared_reads_unfused!(X, Y)
(; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X
Expand Down Expand Up @@ -30,24 +34,69 @@ function perf_kernel_shared_reads_fused!(X, Y)
end
end

has_cuda = CUDA.has_cuda()
AType = has_cuda ? CUDA.CuArray : Array
arr_size = (prod((50, 5, 5, 6, 50)),)
X = get_arrays(:x, arr_size, AType)
Y = get_arrays(:y, arr_size, AType)
@static get(ENV, "USE_CUDA", nothing) == "true" && using CUDA
use_cuda = @isdefined(CUDA) && CUDA.has_cuda() # will be true if you first run `using CUDA`
AType = use_cuda ? CUDA.CuArray : Array
device_name = use_cuda ? CUDA.name(CUDA.device()) : "CPU"
bm = Benchmark(; device_name, float_type = Float32)
problem_size = (50, 5, 5, 6, 50)

array_size = problem_size # array
X = get_arrays(:x, AType, bm.float_type, array_size)
Y = get_arrays(:y, AType, bm.float_type, array_size)
test_kernel!(;
fused! = perf_kernel_shared_reads_fused!,
unfused! = perf_kernel_shared_reads_unfused!,
X,
Y,
)
push_benchmark!(
bm,
use_cuda,
perf_kernel_shared_reads_unfused!,
X,
Y;
n_reads_writes = 7 + 10,
problem_size = array_size,
)
push_benchmark!(
bm,
use_cuda,
perf_kernel_shared_reads_fused!,
X,
Y;
n_reads_writes = 7 + 10,
problem_size = array_size,
)

array_size = (prod(problem_size),) # vector
X = get_arrays(:x, AType, bm.float_type, array_size)
Y = get_arrays(:y, AType, bm.float_type, array_size)
test_kernel!(;
fused! = perf_kernel_shared_reads_fused!,
unfused! = perf_kernel_shared_reads_unfused!,
X,
Y,
)
# Compile
perf_kernel_shared_reads_unfused!(X, Y)
perf_kernel_shared_reads_fused!(X, Y)
push_benchmark!(
bm,
use_cuda,
perf_kernel_shared_reads_unfused!,
X,
Y;
n_reads_writes = 7 + 10,
problem_size = array_size,
)
push_benchmark!(
bm,
use_cuda,
perf_kernel_shared_reads_fused!,
X,
Y;
n_reads_writes = 7 + 10,
problem_size = array_size,
)

# Benchmark
benchmark_kernel!(perf_kernel_shared_reads_unfused!, X, Y)
benchmark_kernel!(perf_kernel_shared_reads_fused!, X, Y)
tabulate_benchmark(bm)

nothing
Loading

0 comments on commit 8ebc0c4

Please sign in to comment.