From 1684216d4b9618647cab6c82cbb25e1ff2f6642a Mon Sep 17 00:00:00 2001 From: Charles Kawczynski Date: Fri, 4 Oct 2024 10:39:16 -0400 Subject: [PATCH] Split unit tests from benchmarks --- .buildkite/pipeline.yml | 22 +++++++++- ext/MultiBroadcastFusionCUDAExt.jl | 15 +++++-- .../execution/bm_fused_reads_vs_hard_coded.jl | 12 +++-- test/execution/bm_fused_shared_reads.jl | 6 ++- .../execution/bm_fused_shared_reads_writes.jl | 6 ++- test/execution/utils_benchmark.jl | 10 +++++ test/execution/utils_test.jl | 44 ++++++++++++------- 7 files changed, 86 insertions(+), 29 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index a3ff316..63ec6bc 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -39,15 +39,35 @@ steps: - label: "CPU tests" key: tests_cpu - command: "julia --color=yes --project=.buildkite test/runtests.jl" + command: "julia --color=yes --check-bounds=yes --project=.buildkite test/runtests.jl" - label: "CUDA tests" key: tests_cuda + command: + - "julia --project=.buildkite -e 'using CUDA; CUDA.versioninfo()'" + - "julia --color=yes --check-bounds=yes --project=.buildkite test/runtests.jl" + env: + USE_CUDA: "true" + agents: + slurm_gpus: 1 + + - group: "Benchmarks" + steps: + + - label: "CPU benchmarks" + key: bm_cpu + command: "julia --color=yes --project=.buildkite test/runtests.jl" + env: + PERFORM_BENCHMARK: "true" + + - label: "CUDA benchmarks" + key: bm_cuda command: - "julia --project=.buildkite -e 'using CUDA; CUDA.versioninfo()'" - "julia --color=yes --project=.buildkite test/runtests.jl" env: USE_CUDA: "true" + PERFORM_BENCHMARK: "true" agents: slurm_gpus: 1 diff --git a/ext/MultiBroadcastFusionCUDAExt.jl b/ext/MultiBroadcastFusionCUDAExt.jl index 61e3895..68459de 100644 --- a/ext/MultiBroadcastFusionCUDAExt.jl +++ b/ext/MultiBroadcastFusionCUDAExt.jl @@ -9,20 +9,29 @@ MBF.device(x::CUDA.CuArray) = MBF.MBF_CUDA() function fused_copyto!(fmb::MBF.FusedMultiBroadcast, ::MBF.MBF_CUDA) (; pairs) = fmb dest = first(pairs).first + destinations = map(p -> p.first, pairs) nitems = length(parent(dest)) max_threads = 256 # can be higher if conditions permit nthreads = min(max_threads, nitems) nblocks = cld(nitems, nthreads) - CUDA.@cuda threads = (nthreads) blocks = (nblocks) fused_copyto_kernel!(fmb) + a1 = axes(dest) + all(a -> axes(a) == axes(dest), destinations) || + error("Cannot fuse broadcast expressions with unequal broadcast axes") + CI = CartesianIndices(axes(dest)) + CUDA.@cuda threads = (nthreads) blocks = (nblocks) fused_copyto_kernel!( + fmb, + CI, + ) return nothing end -function fused_copyto_kernel!(fmb::MBF.FusedMultiBroadcast) +import Base.Broadcast +function fused_copyto_kernel!(fmb::MBF.FusedMultiBroadcast, CI) (; pairs) = fmb dest = first(pairs).first nitems = length(dest) idx = CUDA.threadIdx().x + (CUDA.blockIdx().x - 1) * CUDA.blockDim().x if idx ≤ nitems - MBF.rcopyto_at!(pairs, idx) + MBF.rcopyto_at!(pairs, CI[idx]) end return nothing end diff --git a/test/execution/bm_fused_reads_vs_hard_coded.jl b/test/execution/bm_fused_reads_vs_hard_coded.jl index 18695e3..62bcd2f 100644 --- a/test/execution/bm_fused_reads_vs_hard_coded.jl +++ b/test/execution/bm_fused_reads_vs_hard_coded.jl @@ -90,13 +90,15 @@ problem_size = (50, 5, 5, 6, 5400) array_size = problem_size # array X = get_arrays(:x, AType, bm.float_type, array_size) Y = get_arrays(:y, AType, bm.float_type, array_size) -test_kernel!(; +test_kernel!( + use_cuda; fused! = perf_kernel_fused!, unfused! = perf_kernel_unfused!, X, Y, ) -use_cuda && test_kernel!(; +use_cuda && test_kernel!( + use_cuda; fused! = perf_kernel_hard_coded!, unfused! = perf_kernel_unfused!, X, @@ -133,13 +135,15 @@ use_cuda && push_benchmark!( array_size = (prod(problem_size),) # vector X = get_arrays(:x, AType, bm.float_type, array_size) Y = get_arrays(:y, AType, bm.float_type, array_size) -test_kernel!(; +test_kernel!( + use_cuda; fused! = perf_kernel_fused!, unfused! = perf_kernel_unfused!, X, Y, ) -use_cuda && test_kernel!(; +use_cuda && test_kernel!( + use_cuda; fused! = perf_kernel_hard_coded!, unfused! = perf_kernel_unfused!, X, diff --git a/test/execution/bm_fused_shared_reads.jl b/test/execution/bm_fused_shared_reads.jl index b23f6fb..16d5cb6 100644 --- a/test/execution/bm_fused_shared_reads.jl +++ b/test/execution/bm_fused_shared_reads.jl @@ -38,7 +38,8 @@ problem_size = (50, 5, 5, 6, 5400) array_size = problem_size # array X = get_arrays(:x, AType, bm.float_type, array_size) Y = get_arrays(:y, AType, bm.float_type, array_size) -test_kernel!(; +test_kernel!( + use_cuda; fused! = perf_kernel_shared_reads_fused!, unfused! = perf_kernel_shared_reads_unfused!, X, @@ -66,7 +67,8 @@ push_benchmark!( array_size = (prod(problem_size),) # vector X = get_arrays(:x, AType, bm.float_type, array_size) Y = get_arrays(:y, AType, bm.float_type, array_size) -test_kernel!(; +test_kernel!( + use_cuda; fused! = perf_kernel_shared_reads_fused!, unfused! = perf_kernel_shared_reads_unfused!, X, diff --git a/test/execution/bm_fused_shared_reads_writes.jl b/test/execution/bm_fused_shared_reads_writes.jl index c3f8a61..7a9927d 100644 --- a/test/execution/bm_fused_shared_reads_writes.jl +++ b/test/execution/bm_fused_shared_reads_writes.jl @@ -42,7 +42,8 @@ problem_size = (50, 5, 5, 6, 5400) array_size = problem_size # array X = get_arrays(:x, AType, bm.float_type, array_size) Y = get_arrays(:y, AType, bm.float_type, array_size) -test_kernel!(; +test_kernel!( + use_cuda; unfused! = perf_kernel_shared_reads_writes_unfused!, fused! = perf_kernel_shared_reads_writes_fused!, X, @@ -71,7 +72,8 @@ push_benchmark!( array_size = (prod(problem_size),) # vector X = get_arrays(:x, AType, bm.float_type, array_size) Y = get_arrays(:y, AType, bm.float_type, array_size) -test_kernel!(; +test_kernel!( + use_cuda; unfused! = perf_kernel_shared_reads_writes_unfused!, fused! = perf_kernel_shared_reads_writes_fused!, X, diff --git a/test/execution/utils_benchmark.jl b/test/execution/utils_benchmark.jl index 97c2c9e..5539e38 100644 --- a/test/execution/utils_benchmark.jl +++ b/test/execution/utils_benchmark.jl @@ -60,6 +60,11 @@ trunc_time(s::String) = count(',', s) > 1 ? join(split(s, ",")[1:2], ",") : s import PrettyTables function tabulate_benchmark(bm) + perform_benchmark = get(ENV, "PERFORM_BENCHMARK", false) == "true" + if !perform_benchmark + @warn "Benchmark skipped, set `ENV[\"PERFORM_BENCHMARK\"] = true` to run benchmarks" + return nothing + end funcs = map(x -> strip(x.caller), bm.data) timings = map(x -> time_and_units_str(x.kernel_time_s), bm.data) n_reads_writes = map(x -> x.n_reads_writes, bm.data) @@ -159,6 +164,11 @@ function benchmark_trial!(use_cuda, f!, X, Y) end function push_benchmark!(bm, use_cuda, f!, X, Y; n_reads_writes, problem_size) + perform_benchmark = get(ENV, "PERFORM_BENCHMARK", false) == "true" + if !perform_benchmark + @warn "Benchmark skipped, set `ENV[\"PERFORM_BENCHMARK\"] = true` to run benchmarks" + return nothing + end f!(X, Y) # compile first trial = benchmark_trial!(use_cuda, f!, X, Y) e = minimum(trial.times) * 1e-9 # to seconds diff --git a/test/execution/utils_test.jl b/test/execution/utils_test.jl index e9bc638..266dfa9 100644 --- a/test/execution/utils_test.jl +++ b/test/execution/utils_test.jl @@ -21,9 +21,14 @@ function __rprint_diff(io::IO, xi, yi; pc, xname, yname) # assume we can compute xs = xname * string(join(pc)) ys = yname * string(join(pc)) println(io, "==================== Difference found:") - println(io, "$xs: ", xi) - println(io, "$ys: ", yi) - println(io, "($xs .- $ys): ", (xi .- yi)) + println(io, "maximum(abs.(Δ)) = $(maximum(abs.(xi .- yi)))") + println(io, "maximum(abs.(xi)) = $(maximum(abs.(xi)))") + println(io, "maximum(abs.(yi)) = $(maximum(abs.(yi)))") + println(io, "extrema(xi) = $(extrema(xi))") + println(io, "extrema(yi) = $(extrema(yi))") + # println(io, "$xs: ", xi) + # println(io, "$ys: ", yi) + # println(io, "($xs .- $ys): ", (xi .- yi)) end return nothing end @@ -56,11 +61,18 @@ end # Recursively compare contents of similar types -_rcompare(pass, x::T, y::T) where {T} = pass && (x == y) +function _rcompare(pass, x::T, y::T; use_cuda) where {T} + if use_cuda + return pass && (x ≈ y) # CUDA doesn't always satisfy == + else + return pass && (x == y) + end +end -function _rcompare(pass, x::T, y::T) where {T <: NamedTuple} +function _rcompare(pass, x::T, y::T; use_cuda) where {T <: NamedTuple} for pn in propertynames(x) - pass &= _rcompare(pass, getproperty(x, pn), getproperty(y, pn)) + pass &= + _rcompare(pass, getproperty(x, pn), getproperty(y, pn); use_cuda) end return pass end @@ -71,18 +83,16 @@ end Recursively compare given types via `==`. Returns `true` if `x == y` recursively. """ -rcompare(x::T, y::T) where {T <: NamedTuple} = _rcompare(true, x, y) -rcompare(x, y) = false +rcompare(x::T, y::T; use_cuda) where {T <: NamedTuple} = + _rcompare(true, x, y; use_cuda) +rcompare(x, y; use_cuda) = false -function test_compare(x, y) - if !rcompare(x, y) - @rprint_diff(x, y) - else - @test rcompare(x, y) - end +function test_compare(x, y; use_cuda) + rcompare(x, y; use_cuda) || @rprint_diff(x, y) + @test rcompare(x, y; use_cuda) end -function test_kernel!(; fused!, unfused!, X, Y) +function test_kernel!(use_cuda; fused!, unfused!, X, Y) for x in X x .= map(_ -> rand(), x) end @@ -96,7 +106,7 @@ function test_kernel!(; fused!, unfused!, X, Y) fused!(X_fused, Y_fused) unfused!(X_unfused, Y_unfused) @testset "Test correctness of $(nameof(typeof(fused!)))" begin - test_compare(X_fused, X_unfused) - test_compare(Y_fused, Y_unfused) + test_compare(X_fused, X_unfused; use_cuda) + test_compare(Y_fused, Y_unfused; use_cuda) end end