diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index a3ff316..977a369 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -39,15 +39,35 @@ steps: - label: "CPU tests" key: tests_cpu - command: "julia --color=yes --project=.buildkite test/runtests.jl" + command: "julia --color=yes --check-bounds=yes --project=.buildkite test/runtests.jl" - label: "CUDA tests" key: tests_cuda + command: - "julia --project=.buildkite -e 'using CUDA; CUDA.versioninfo()'" - "julia --color=yes --check-bounds=yes --project=.buildkite test/runtests.jl" + env: + USE_CUDA: "true" + agents: + slurm_gpus: 1 + + - group: "Benchmarks" + steps: + + - label: "CPU benchmarks" + key: bm_cpu + command: "julia --color=yes --project=.buildkite test/runtests.jl" + env: + PERFORM_BENCHMARKS: "true" + + - label: "CUDA benchmarks" + key: bm_cuda command: - "julia --project=.buildkite -e 'using CUDA; CUDA.versioninfo()'" - "julia --color=yes --project=.buildkite test/runtests.jl" env: USE_CUDA: "true" + PERFORM_BENCHMARKS: "true" agents: slurm_gpus: 1 diff --git a/ext/MultiBroadcastFusionCUDAExt.jl b/ext/MultiBroadcastFusionCUDAExt.jl index 61e3895..68459de 100644 --- a/ext/MultiBroadcastFusionCUDAExt.jl +++ b/ext/MultiBroadcastFusionCUDAExt.jl @@ -9,20 +9,29 @@ MBF.device(x::CUDA.CuArray) = MBF.MBF_CUDA() function fused_copyto!(fmb::MBF.FusedMultiBroadcast, ::MBF.MBF_CUDA) (; pairs) = fmb dest = first(pairs).first + destinations = map(p -> p.first, pairs) nitems = length(parent(dest)) max_threads = 256 # can be higher if conditions permit nthreads = min(max_threads, nitems) nblocks = cld(nitems, nthreads) - CUDA.@cuda threads = (nthreads) blocks = (nblocks) fused_copyto_kernel!(fmb) + a1 = axes(dest) + all(a -> axes(a) == a1, destinations) || + error("Cannot fuse broadcast expressions with unequal broadcast axes") + CI = CartesianIndices(a1) + CUDA.@cuda threads = (nthreads) blocks = (nblocks) fused_copyto_kernel!( + fmb, + CI, + ) return nothing end 
-function fused_copyto_kernel!(fmb::MBF.FusedMultiBroadcast) +import Base.Broadcast +function fused_copyto_kernel!(fmb::MBF.FusedMultiBroadcast, CI) (; pairs) = fmb dest = first(pairs).first nitems = length(dest) idx = CUDA.threadIdx().x + (CUDA.blockIdx().x - 1) * CUDA.blockDim().x if idx ≤ nitems - MBF.rcopyto_at!(pairs, idx) + MBF.rcopyto_at!(pairs, CI[idx]) end return nothing end diff --git a/test/execution/utils_benchmark.jl b/test/execution/utils_benchmark.jl index 97c2c9e..5539e38 100644 --- a/test/execution/utils_benchmark.jl +++ b/test/execution/utils_benchmark.jl @@ -60,6 +60,11 @@ trunc_time(s::String) = count(',', s) > 1 ? join(split(s, ",")[1:2], ",") : s import PrettyTables function tabulate_benchmark(bm) + perform_benchmark = get(ENV, "PERFORM_BENCHMARKS", "false") == "true" + if !perform_benchmark + @warn "Benchmark skipped, set `ENV[\"PERFORM_BENCHMARKS\"] = \"true\"` to run benchmarks" + return nothing + end funcs = map(x -> strip(x.caller), bm.data) timings = map(x -> time_and_units_str(x.kernel_time_s), bm.data) n_reads_writes = map(x -> x.n_reads_writes, bm.data) @@ -159,6 +164,11 @@ function benchmark_trial!(use_cuda, f!, X, Y) end function push_benchmark!(bm, use_cuda, f!, X, Y; n_reads_writes, problem_size) + perform_benchmark = get(ENV, "PERFORM_BENCHMARKS", "false") == "true" + if !perform_benchmark + @warn "Benchmark skipped, set `ENV[\"PERFORM_BENCHMARKS\"] = \"true\"` to run benchmarks" + return nothing + end f!(X, Y) # compile first trial = benchmark_trial!(use_cuda, f!, X, Y) e = minimum(trial.times) * 1e-9 # to seconds diff --git a/test/execution/utils_test.jl b/test/execution/utils_test.jl index e9bc638..f569b63 100644 --- a/test/execution/utils_test.jl +++ b/test/execution/utils_test.jl @@ -75,11 +75,8 @@ rcompare(x::T, y::T) where {T <: NamedTuple} = _rcompare(true, x, y) rcompare(x, y) = false function test_compare(x, y) - if !rcompare(x, y) - @rprint_diff(x, y) - else - @test rcompare(x, y) - end + rcompare(x, y) || @rprint_diff(x, y) + @test 
rcompare(x, y) end function test_kernel!(; fused!, unfused!, X, Y)