CliMA · charleskawczynski · Oct 4, 2024 · Oct 4, 2024
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
@@ -39,15 +39,35 @@ steps:
 
       - label: "CPU tests"
         key: tests_cpu
-        command: "julia --color=yes --project=.buildkite test/runtests.jl"
+        command: "julia --color=yes --check-bounds=yes --project=.buildkite test/runtests.jl" # tests run with bounds checks forced on
 
       - label: "CUDA tests"
         key: tests_cuda
+        command:
+          - "julia --project=.buildkite -e 'using CUDA; CUDA.versioninfo()'" # print CUDA version info before the tests
+          - "julia --color=yes --check-bounds=yes --project=.buildkite test/runtests.jl" # tests run with bounds checks forced on
+        env:
+          USE_CUDA: "true"
+        agents:
+          slurm_gpus: 1
+
+  - group: "Benchmarks"
+    steps:
+
+      - label: "CPU benchmarks"
+        key: bm_cpu
+        command: "julia --color=yes --project=.buildkite test/runtests.jl" # no --check-bounds flag here, unlike the test steps
+        env:
+          PERFORM_BENCHMARK: "true" # benchmarks are skipped unless this is "true"
+
+      - label: "CUDA benchmarks"
+        key: bm_cuda
         command:
           - "julia --project=.buildkite -e 'using CUDA; CUDA.versioninfo()'"
           - "julia --color=yes --project=.buildkite test/runtests.jl"
         env:
           USE_CUDA: "true"
+          PERFORM_BENCHMARK: "true" # benchmarks are skipped unless this is "true"
         agents:
           slurm_gpus: 1
 
diff --git a/ext/MultiBroadcastFusionCUDAExt.jl b/ext/MultiBroadcastFusionCUDAExt.jl
@@ -9,20 +9,50 @@ MBF.device(x::CUDA.CuArray) = MBF.MBF_CUDA()
+"""
+    fused_copyto!(fmb::MBF.FusedMultiBroadcast, ::MBF.MBF_CUDA)
+
+Launch `fused_copyto_kernel!` on the GPU for all broadcast pairs in `fmb`, with
+enough threads (at most 256 per block) to cover `length(parent(dest))`, where
+`dest` is the first destination.
+
+Throws an error if the destinations do not all have the same axes, because one
+set of Cartesian indices is shared by every pair. Returns early when
+`parent(dest)` is empty. Returns `nothing`.
+"""
 function fused_copyto!(fmb::MBF.FusedMultiBroadcast, ::MBF.MBF_CUDA)
     (; pairs) = fmb
     dest = first(pairs).first
+    destinations = map(p -> p.first, pairs)
+    # All destinations are indexed with the same Cartesian indices in the
+    # kernel, so they must share the first destination's axes.
+    a1 = axes(dest)
+    all(a -> axes(a) == a1, destinations) ||
+        error("Cannot fuse broadcast expressions with unequal broadcast axes")
     nitems = length(parent(dest))
+    # Nothing to launch for empty arrays; this also avoids `cld(0, 0)` below.
+    nitems == 0 && return nothing
     max_threads = 256 # can be higher if conditions permit
     nthreads = min(max_threads, nitems)
     nblocks = cld(nitems, nthreads)
-    CUDA.@cuda threads = (nthreads) blocks = (nblocks) fused_copyto_kernel!(fmb)
+    CI = CartesianIndices(a1)
+    CUDA.@cuda threads = (nthreads) blocks = (nblocks) fused_copyto_kernel!(
+        fmb,
+        CI,
+    )
     return nothing
 end
-function fused_copyto_kernel!(fmb::MBF.FusedMultiBroadcast)
+import Base.Broadcast
+"""
+    fused_copyto_kernel!(fmb::MBF.FusedMultiBroadcast, CI)
+
+CUDA kernel body: compute this thread's global linear index and, if it is
+within `length(dest)`, call `MBF.rcopyto_at!` at the matching entry of `CI`.
+"""
+function fused_copyto_kernel!(fmb::MBF.FusedMultiBroadcast, CI)
     (; pairs) = fmb
     dest = first(pairs).first
     nitems = length(dest)
     idx = CUDA.threadIdx().x + (CUDA.blockIdx().x - 1) * CUDA.blockDim().x
     if idx ≤ nitems
-        MBF.rcopyto_at!(pairs, idx)
+        MBF.rcopyto_at!(pairs, CI[idx])
     end
     return nothing
 end

diff --git a/test/execution/bm_fused_reads_vs_hard_coded.jl b/test/execution/bm_fused_reads_vs_hard_coded.jl
@@ -90,13 +90,15 @@ problem_size = (50, 5, 5, 6, 5400)
 array_size = problem_size # array
 X = get_arrays(:x, AType, bm.float_type, array_size)
 Y = get_arrays(:y, AType, bm.float_type, array_size)
-test_kernel!(;
+test_kernel!(
+    use_cuda;
     fused! = perf_kernel_fused!,
     unfused! = perf_kernel_unfused!,
     X,
     Y,
 )
-use_cuda && test_kernel!(;
+use_cuda && test_kernel!(
+    use_cuda;
     fused! = perf_kernel_hard_coded!,
     unfused! = perf_kernel_unfused!,
     X,
@@ -133,13 +135,15 @@ use_cuda && push_benchmark!(
 array_size = (prod(problem_size),) # vector
 X = get_arrays(:x, AType, bm.float_type, array_size)
 Y = get_arrays(:y, AType, bm.float_type, array_size)
-test_kernel!(;
+test_kernel!(
+    use_cuda;
     fused! = perf_kernel_fused!,
     unfused! = perf_kernel_unfused!,
     X,
     Y,
 )
-use_cuda && test_kernel!(;
+use_cuda && test_kernel!(
+    use_cuda;
     fused! = perf_kernel_hard_coded!,
     unfused! = perf_kernel_unfused!,
     X,

diff --git a/test/execution/bm_fused_shared_reads.jl b/test/execution/bm_fused_shared_reads.jl
@@ -38,7 +38,8 @@ problem_size = (50, 5, 5, 6, 5400)
 array_size = problem_size # array
 X = get_arrays(:x, AType, bm.float_type, array_size)
 Y = get_arrays(:y, AType, bm.float_type, array_size)
-test_kernel!(;
+test_kernel!(
+    use_cuda;
     fused! = perf_kernel_shared_reads_fused!,
     unfused! = perf_kernel_shared_reads_unfused!,
     X,
@@ -66,7 +67,8 @@ push_benchmark!(
 array_size = (prod(problem_size),) # vector
 X = get_arrays(:x, AType, bm.float_type, array_size)
 Y = get_arrays(:y, AType, bm.float_type, array_size)
-test_kernel!(;
+test_kernel!(
+    use_cuda;
     fused! = perf_kernel_shared_reads_fused!,
     unfused! = perf_kernel_shared_reads_unfused!,
     X,

diff --git a/test/execution/bm_fused_shared_reads_writes.jl b/test/execution/bm_fused_shared_reads_writes.jl
@@ -42,7 +42,8 @@ problem_size = (50, 5, 5, 6, 5400)
 array_size = problem_size # array
 X = get_arrays(:x, AType, bm.float_type, array_size)
 Y = get_arrays(:y, AType, bm.float_type, array_size)
-test_kernel!(;
+test_kernel!(
+    use_cuda;
     unfused! = perf_kernel_shared_reads_writes_unfused!,
     fused! = perf_kernel_shared_reads_writes_fused!,
     X,
@@ -71,7 +72,8 @@ push_benchmark!(
 array_size = (prod(problem_size),) # vector
 X = get_arrays(:x, AType, bm.float_type, array_size)
 Y = get_arrays(:y, AType, bm.float_type, array_size)
-test_kernel!(;
+test_kernel!(
+    use_cuda;
     unfused! = perf_kernel_shared_reads_writes_unfused!,
     fused! = perf_kernel_shared_reads_writes_fused!,
     X,

diff --git a/test/execution/utils_benchmark.jl b/test/execution/utils_benchmark.jl
@@ -60,6 +60,11 @@ trunc_time(s::String) = count(',', s) > 1 ? join(split(s, ",")[1:2], ",") : s
 
 import PrettyTables
 function tabulate_benchmark(bm)
+    perform_benchmark = get(ENV, "PERFORM_BENCHMARK", false) == "true"
+    if !perform_benchmark
+        @warn "Benchmark skipped, set `ENV[\"PERFORM_BENCHMARK\"] = true` to run benchmarks"
+        return nothing
+    end
     funcs = map(x -> strip(x.caller), bm.data)
     timings = map(x -> time_and_units_str(x.kernel_time_s), bm.data)
     n_reads_writes = map(x -> x.n_reads_writes, bm.data)
@@ -159,6 +164,11 @@ function benchmark_trial!(use_cuda, f!, X, Y)
 end
 
 function push_benchmark!(bm, use_cuda, f!, X, Y; n_reads_writes, problem_size)
+    perform_benchmark = get(ENV, "PERFORM_BENCHMARK", false) == "true"
+    if !perform_benchmark
+        @warn "Benchmark skipped, set `ENV[\"PERFORM_BENCHMARK\"] = true` to run benchmarks"
+        return nothing
+    end
     f!(X, Y) # compile first
     trial = benchmark_trial!(use_cuda, f!, X, Y)
     e = minimum(trial.times) * 1e-9 # to seconds

diff --git a/test/execution/utils_test.jl b/test/execution/utils_test.jl
@@ -21,9 +21,14 @@ function __rprint_diff(io::IO, xi, yi; pc, xname, yname) # assume we can compute
         xs = xname * string(join(pc))
         ys = yname * string(join(pc))
         println(io, "==================== Difference found:")
-        println(io, "$xs: ", xi)
-        println(io, "$ys: ", yi)
-        println(io, "($xs .- $ys): ", (xi .- yi))
+        println(io, "maximum(abs.($xs .- $ys)) = $(maximum(abs.(xi .- yi)))")
+        println(io, "maximum(abs.($xs)) = $(maximum(abs.(xi)))")
+        println(io, "maximum(abs.($ys)) = $(maximum(abs.(yi)))")
+        println(io, "extrema($xs) = $(extrema(xi))")
+        println(io, "extrema($ys) = $(extrema(yi))")
+        # Only summary statistics are printed, labelled with the property path
+        # (`xs`/`ys`) so the differing field can be identified. The full values
+        # are not dumped, presumably because the compared arrays can be large.
     end
     return nothing
 end
@@ -56,11 +61,18 @@ end
 
 
 # Recursively compare contents of similar types
-_rcompare(pass, x::T, y::T) where {T} = pass && (x == y)
+function _rcompare(pass, x::T, y::T; use_cuda) where {T}
+    # An earlier mismatch already decides the result; skip the comparison.
+    pass || return false
+    # CUDA doesn't always satisfy ==, so compare approximately there.
+    matched = use_cuda ? (x ≈ y) : (x == y)
+    return matched
+end
 
-function _rcompare(pass, x::T, y::T) where {T <: NamedTuple}
+function _rcompare(pass, x::T, y::T; use_cuda) where {T <: NamedTuple}
     for pn in propertynames(x)
-        pass &= _rcompare(pass, getproperty(x, pn), getproperty(y, pn))
+        xp, yp = getproperty(x, pn), getproperty(y, pn) # same-named fields
+        pass &= _rcompare(pass, xp, yp; use_cuda)
     end
     return pass
 end
@@ -71,18 +83,16 @@ end
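-Recursively compare given types via `==`.
-Returns `true` if `x == y` recursively.
+Recursively compare given types via `==` (or `≈` when `use_cuda` is `true`).
+Returns `true` if `x == y` (or `x ≈ y` with `use_cuda`) recursively.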
 """
-rcompare(x::T, y::T) where {T <: NamedTuple} = _rcompare(true, x, y)
-rcompare(x, y) = false
+rcompare(x::T, y::T; use_cuda) where {T <: NamedTuple} =
+    _rcompare(true, x, y; use_cuda) # start the recursion with `pass = true`
+rcompare(x, y; use_cuda) = false # fallback: not same-typed NamedTuples
 
-function test_compare(x, y)
-    if !rcompare(x, y)
-        @rprint_diff(x, y)
-    else
-        @test rcompare(x, y)
-    end
+function test_compare(x, y; use_cuda)
+    rcompare(x, y; use_cuda) || @rprint_diff(x, y) # report details on mismatch
+    @test rcompare(x, y; use_cuda) # always recorded, so a mismatch fails the test
 end
 
-function test_kernel!(; fused!, unfused!, X, Y)
+function test_kernel!(use_cuda; fused!, unfused!, X, Y)
     for x in X
         x .= map(_ -> rand(), x)
     end
@@ -96,7 +106,7 @@ function test_kernel!(; fused!, unfused!, X, Y)
     fused!(X_fused, Y_fused)
     unfused!(X_unfused, Y_unfused)
     @testset "Test correctness of $(nameof(typeof(fused!)))" begin
-        test_compare(X_fused, X_unfused)
-        test_compare(Y_fused, Y_unfused)
+        test_compare(X_fused, X_unfused; use_cuda)
+        test_compare(Y_fused, Y_unfused; use_cuda)
     end
 end