Skip to content

Commit

Permalink
Update benchmarks problem size
Browse files Browse the repository at this point in the history
  • Loading branch information
charleskawczynski committed Oct 3, 2024
1 parent f3064ba commit 62913a2
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 36 deletions.
6 changes: 4 additions & 2 deletions test/execution/bm_fused_reads_vs_hard_coded.jl
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@ use_cuda = @isdefined(CUDA) && CUDA.has_cuda() # will be true if you first run `
function knl_multi_copyto_hard_coded!(X, Y, ::Val{nitems}) where {nitems}
(; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X
(; y1, y2, y3, y4, y5, y6, y7, y8, y9, y10) = Y
idx = CUDA.threadIdx().x + (CUDA.blockIdx().x - 1) * CUDA.blockDim().x
idx =
CUDA.threadIdx().x +
(CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x
@inbounds begin
if idx <= nitems
y1[idx] = x1[idx] + x2[idx] + x3[idx] + x4[idx]
Expand Down Expand Up @@ -95,7 +97,7 @@ AType = use_cuda ? CUDA.CuArray : Array
device_name = use_cuda ? CUDA.name(CUDA.device()) : "CPU"
bm = Benchmark(; device_name, float_type = Float32)

problem_size = (50, 5, 5, 6, 50)
problem_size = (50, 5, 5, 6, 5400)

array_size = problem_size # array
X = get_arrays(:x, AType, bm.float_type, array_size)
Expand Down
2 changes: 1 addition & 1 deletion test/execution/bm_fused_shared_reads.jl
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ use_cuda = @isdefined(CUDA) && CUDA.has_cuda() # will be true if you first run `
AType = use_cuda ? CUDA.CuArray : Array
device_name = use_cuda ? CUDA.name(CUDA.device()) : "CPU"
bm = Benchmark(; device_name, float_type = Float32)
problem_size = (50, 5, 5, 6, 50)
problem_size = (50, 5, 5, 6, 5400)

array_size = problem_size # array
X = get_arrays(:x, AType, bm.float_type, array_size)
Expand Down
17 changes: 7 additions & 10 deletions test/execution/bm_fused_shared_reads_writes.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ end
function perf_kernel_shared_reads_writes_fused!(X, Y)
(; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X
(; y1, y2, y3, y4, y5, y6, y7, y8, y9, y10) = Y
# Total: 10 writes, 10 reads, and 5 read/write overlaps
MBF.@fused_direct begin
@. y1 = x1 + x6
@. y2 = x2 + x7
Expand All @@ -45,12 +46,8 @@ end
use_cuda = @isdefined(CUDA) && CUDA.has_cuda() # will be true if you first run `using CUDA`
AType = use_cuda ? CUDA.CuArray : Array
device_name = use_cuda ? CUDA.name(CUDA.device()) : "CPU"
bm = Benchmark(;
problem_size = (prod((50, 5, 5, 6, 50)),),
device_name,
float_type = Float32,
)
problem_size = (50, 5, 5, 6, 50)
bm = Benchmark(; device_name, float_type = Float32)
problem_size = (50, 5, 5, 6, 5400)

array_size = problem_size # array
X = get_arrays(:x, AType, bm.float_type, array_size)
Expand All @@ -68,7 +65,7 @@ push_benchmark!(
perf_kernel_shared_reads_writes_unfused!,
X,
Y;
n_reads_writes = 10 + 15,
n_reads_writes = 10 + 10,
problem_size = array_size,
)
push_benchmark!(
Expand All @@ -77,7 +74,7 @@ push_benchmark!(
perf_kernel_shared_reads_writes_fused!,
X,
Y;
n_reads_writes = 10 + 15,
n_reads_writes = 10 + 10,
problem_size = array_size,
)

Expand All @@ -97,7 +94,7 @@ push_benchmark!(
perf_kernel_shared_reads_writes_unfused!,
X,
Y;
n_reads_writes = 10 + 15,
n_reads_writes = 10 + 10,
problem_size = array_size,
)
push_benchmark!(
Expand All @@ -106,7 +103,7 @@ push_benchmark!(
perf_kernel_shared_reads_writes_fused!,
X,
Y;
n_reads_writes = 10 + 15,
n_reads_writes = 10 + 10,
problem_size = array_size,
)

Expand Down
25 changes: 2 additions & 23 deletions test/execution/utils_test.jl
Original file line number Diff line number Diff line change
Expand Up @@ -90,34 +90,13 @@ function test_kernel!(; fused!, unfused!, X, Y)
y .= map(_ -> rand(), y)
end
X_fused = deepcopy(X)
X_unfused = deepcopy(X)
X_unfused = X
Y_fused = deepcopy(Y)
Y_unfused = deepcopy(Y)
Y_unfused = Y
fused!(X_fused, Y_fused)
unfused!(X_unfused, Y_unfused)
@testset "Test correctness of $(nameof(typeof(fused!)))" begin
test_compare(X_fused, X_unfused)
test_compare(Y_fused, Y_unfused)
end
end
"""
    test_kernel_args!(; fused!, unfused!, args)

Check that `fused!` and `unfused!` produce matching results when each is
called on its own copy of `args`.

Every array in `args.X` and then `args.Y` is first overwritten in place with
`rand` values. Two independent deep copies of `X` and `Y` are then packed into
named tuples `(; X, Y)`; `fused!` is called on one and `unfused!` on the other.
The resulting `X` and `Y` collections are compared pairwise with
`test_compare` inside a `@testset` named after the type of `fused!`.

# Keywords
- `fused!`: callable invoked as `fused!(args_fused)`.
- `unfused!`: callable invoked as `unfused!(args_unfused)`.
- `args`: object with properties `X` and `Y`, each an iterable of arrays.
"""
function test_kernel_args!(; fused!, unfused!, args)
    X = args.X
    Y = args.Y

    # Randomize the inputs first, then the outputs, so both kernels start from
    # identical, non-trivial data.
    randomize!(a) = (a .= rand(size(a)...))
    foreach(randomize!, X)
    foreach(randomize!, Y)

    # Each kernel gets its own deep copy so neither can observe the other's writes.
    args_fused = (; X = deepcopy(X), Y = deepcopy(Y))
    args_unfused = (; X = deepcopy(X), Y = deepcopy(Y))

    fused!(args_fused)
    unfused!(args_unfused)

    @testset "Test correctness of $(nameof(typeof(fused!)))" begin
        test_compare(args_fused.X, args_unfused.X)
        test_compare(args_fused.Y, args_unfused.Y)
    end
end

0 comments on commit 62913a2

Please sign in to comment.