diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index a3ff316..977a369 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -39,15 +39,35 @@ steps: - label: "CPU tests" key: tests_cpu - command: "julia --color=yes --project=.buildkite test/runtests.jl" + command: "julia --color=yes --check-bounds=yes --project=.buildkite test/runtests.jl" - label: "CUDA tests" key: tests_cuda + command: - "julia --project=.buildkite -e 'using CUDA; CUDA.versioninfo()'" - "julia --color=yes --check-bounds=yes --project=.buildkite test/runtests.jl" + env: + USE_CUDA: "true" + agents: + slurm_gpus: 1 + + - group: "Benchmarks" + steps: + + - label: "CPU benchmarks" + key: bm_cpu + command: "julia --color=yes --project=.buildkite test/runtests.jl" + env: + PERFORM_BENCHMARKS: "true" + + - label: "CUDA benchmarks" + key: bm_cuda command: - "julia --project=.buildkite -e 'using CUDA; CUDA.versioninfo()'" - "julia --color=yes --project=.buildkite test/runtests.jl" env: USE_CUDA: "true" + PERFORM_BENCHMARKS: "true" agents: slurm_gpus: 1 diff --git a/ext/MultiBroadcastFusionCUDAExt.jl b/ext/MultiBroadcastFusionCUDAExt.jl index 61e3895..68459de 100644 --- a/ext/MultiBroadcastFusionCUDAExt.jl +++ b/ext/MultiBroadcastFusionCUDAExt.jl @@ -9,20 +9,29 @@ MBF.device(x::CUDA.CuArray) = MBF.MBF_CUDA() function fused_copyto!(fmb::MBF.FusedMultiBroadcast, ::MBF.MBF_CUDA) (; pairs) = fmb dest = first(pairs).first + destinations = map(p -> p.first, pairs) nitems = length(parent(dest)) max_threads = 256 # can be higher if conditions permit nthreads = min(max_threads, nitems) nblocks = cld(nitems, nthreads) - CUDA.@cuda threads = (nthreads) blocks = (nblocks) fused_copyto_kernel!(fmb) + a1 = axes(dest) + all(a -> axes(a) == a1, destinations) || + error("Cannot fuse broadcast expressions with unequal broadcast axes") + CI = CartesianIndices(a1) + CUDA.@cuda threads = (nthreads) blocks = (nblocks) fused_copyto_kernel!( + fmb, + CI, + ) return nothing end 
-function fused_copyto_kernel!(fmb::MBF.FusedMultiBroadcast) +import Base.Broadcast +function fused_copyto_kernel!(fmb::MBF.FusedMultiBroadcast, CI) (; pairs) = fmb dest = first(pairs).first nitems = length(dest) idx = CUDA.threadIdx().x + (CUDA.blockIdx().x - 1) * CUDA.blockDim().x if idx ≤ nitems - MBF.rcopyto_at!(pairs, idx) + MBF.rcopyto_at!(pairs, CI[idx]) end return nothing end diff --git a/test/execution/utils_benchmark.jl b/test/execution/utils_benchmark.jl index 97c2c9e..5539e38 100644 --- a/test/execution/utils_benchmark.jl +++ b/test/execution/utils_benchmark.jl @@ -60,6 +60,11 @@ trunc_time(s::String) = count(',', s) > 1 ? join(split(s, ",")[1:2], ",") : s import PrettyTables function tabulate_benchmark(bm) + perform_benchmark = get(ENV, "PERFORM_BENCHMARKS", "false") == "true" + if !perform_benchmark + @warn "Benchmark skipped, set `ENV[\"PERFORM_BENCHMARKS\"] = \"true\"` to run benchmarks" + return nothing + end funcs = map(x -> strip(x.caller), bm.data) timings = map(x -> time_and_units_str(x.kernel_time_s), bm.data) n_reads_writes = map(x -> x.n_reads_writes, bm.data) @@ -159,6 +164,11 @@ function benchmark_trial!(use_cuda, f!, X, Y) end function push_benchmark!(bm, use_cuda, f!, X, Y; n_reads_writes, problem_size) + perform_benchmark = get(ENV, "PERFORM_BENCHMARKS", "false") == "true" + if !perform_benchmark + @warn "Benchmark skipped, set `ENV[\"PERFORM_BENCHMARKS\"] = \"true\"` to run benchmarks" + return nothing + end f!(X, Y) # compile first trial = benchmark_trial!(use_cuda, f!, X, Y) e = minimum(trial.times) * 1e-9 # to seconds diff --git a/test/execution/utils_test.jl b/test/execution/utils_test.jl index e9bc638..f569b63 100644 --- a/test/execution/utils_test.jl +++ b/test/execution/utils_test.jl @@ -75,11 +75,8 @@ rcompare(x::T, y::T) where {T <: NamedTuple} = _rcompare(true, x, y) rcompare(x, y) = false function test_compare(x, y) - if !rcompare(x, y) - @rprint_diff(x, y) - else - @test rcompare(x, y) - end + rcompare(x, y) || @rprint_diff(x, y) + @test 
rcompare(x, y) end function test_kernel!(; fused!, unfused!, X, Y)