diff --git a/Project.toml b/Project.toml index ee8d9a1..e5b2a83 100644 --- a/Project.toml +++ b/Project.toml @@ -3,25 +3,28 @@ uuid = "c3c07f87-98de-43f2-a76f-835b330b2cbb" authors = ["CliMA Contributors "] version = "0.3.1" +[deps] +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" + [weakdeps] -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" [extensions] MultiBroadcastFusionCUDAExt = ["CUDA", "Adapt"] [compat] -julia = "^1.9" Adapt = "3, 4" CUDA = "5" +julia = "^1.9" [extras] -Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" +SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] test = ["Test", "CUDA", "Adapt", "SafeTestsets", "BenchmarkTools", "InteractiveUtils"] diff --git a/ext/MultiBroadcastFusionCUDAExt.jl b/ext/MultiBroadcastFusionCUDAExt.jl index 3c6af49..9f7ff77 100644 --- a/ext/MultiBroadcastFusionCUDAExt.jl +++ b/ext/MultiBroadcastFusionCUDAExt.jl @@ -1,55 +1,55 @@ module MultiBroadcastFusionCUDAExt -import CUDA, Adapt -import MultiBroadcastFusion as MBF -import MultiBroadcastFusion: fused_copyto! - -MBF.device(x::CUDA.CuArray) = MBF.GPU() - -function fused_copyto!(fmb::MBF.FusedMultiBroadcast, ::MBF.GPU) - (; pairs) = fmb - dest = first(pairs).first - nitems = length(parent(dest)) - max_threads = 256 # can be higher if conditions permit - nthreads = min(max_threads, nitems) - nblocks = cld(nitems, nthreads) - CUDA.@cuda threads = (nthreads) blocks = (nblocks) fused_copyto_kernel!(fmb) - return nothing -end -function fused_copyto_kernel!(fmb::MBF.FusedMultiBroadcast) - (; pairs) = fmb - dest = first(pairs).first - nitems = length(dest) - idx = CUDA.threadIdx().x + (CUDA.blockIdx().x - 1) * CUDA.blockDim().x - if idx ≤ nitems - MBF.rcopyto_at!(pairs, idx) - end - return nothing -end - -adapt_f(to, f::F) where {F} = Adapt.adapt(to, f) -adapt_f(to, ::Type{F}) where {F} = (x...) -> F(x...) - -adapt_src(to, src::AbstractArray) = Adapt.adapt(to, src) - -function adapt_src(to, bc::Base.Broadcast.Broadcasted) - Base.Broadcast.Broadcasted( - bc.style, - adapt_f(to, bc.f), - Adapt.adapt(to, bc.args), - Adapt.adapt(to, bc.axes), - ) -end - -function Adapt.adapt_structure( - to::CUDA.KernelAdaptor, - fmbc::MBF.FusedMultiBroadcast, -) - MBF.FusedMultiBroadcast(map(fmbc.pairs) do pair - dest = pair.first - src = pair.second - Pair(Adapt.adapt(to, dest), adapt_src(to, src)) - end) -end +# import CUDA, Adapt +# import MultiBroadcastFusion as MBF +# import MultiBroadcastFusion: fused_copyto! 
+ +# # MBF.device(x::CUDA.CuArray) = MBF.GPU() + +# function fused_copyto!(fmb::MBF.FusedMultiBroadcast, ::MBF.GPU) +# (; pairs) = fmb +# dest = first(pairs).first +# nitems = length(parent(dest)) +# max_threads = 256 # can be higher if conditions permit +# nthreads = min(max_threads, nitems) +# nblocks = cld(nitems, nthreads) +# CUDA.@cuda threads = (nthreads) blocks = (nblocks) fused_copyto_kernel!(fmb) +# return nothing +# end +# function fused_copyto_kernel!(fmb::MBF.FusedMultiBroadcast) +# (; pairs) = fmb +# dest = first(pairs).first +# nitems = length(dest) +# idx = CUDA.threadIdx().x + (CUDA.blockIdx().x - 1) * CUDA.blockDim().x +# if idx ≤ nitems +# MBF.rcopyto_at!(pairs, idx) +# end +# return nothing +# end + +# adapt_f(to, f::F) where {F} = Adapt.adapt(to, f) +# adapt_f(to, ::Type{F}) where {F} = (x...) -> F(x...) + +# adapt_src(to, src::AbstractArray) = Adapt.adapt(to, src) + +# function adapt_src(to, bc::Base.Broadcast.Broadcasted) +# Base.Broadcast.Broadcasted( +# bc.style, +# adapt_f(to, bc.f), +# Adapt.adapt(to, bc.args), +# Adapt.adapt(to, bc.axes), +# ) +# end + +# function Adapt.adapt_structure( +# to::CUDA.KernelAdaptor, +# fmbc::MBF.FusedMultiBroadcast, +# ) +# MBF.FusedMultiBroadcast(map(fmbc.pairs) do pair +# dest = pair.first +# src = pair.second +# Pair(Adapt.adapt(to, dest), adapt_src(to, src)) +# end) +# end end diff --git a/src/execution/fused_kernels.jl b/src/execution/fused_kernels.jl index f3bd891..9e0dc44 100644 --- a/src/execution/fused_kernels.jl +++ b/src/execution/fused_kernels.jl @@ -2,57 +2,86 @@ @make_fused fused_direct FusedMultiBroadcast fused_direct @make_fused fused_assemble FusedMultiBroadcast fused_assemble -struct CPU end -struct GPU end -device(x::AbstractArray) = CPU() +import KernelAbstractions as KA +using KernelAbstractions + +# For tests, we can move this out. +struct MBF_CPU end +struct MBF_GPU end +device(x::AbstractArray) = MBF_CPU() + +KA.@kernel function fused_copyto_kernel!(fmb::FusedMultiBroadcast) + (; pairs) = fmb + I = @index(Global, Cartesian) + rcopyto_at!(pairs, I) +end function Base.copyto!(fmb::FusedMultiBroadcast) - pairs = fmb.pairs # (Pair(dest1, bc1),Pair(dest2, bc2),...) - dest = first(pairs).first - fused_copyto!(fmb, device(dest)) + (; pairs) = fmb # (Pair(dest1, bc1), Pair(dest2, bc2),...) + assert_sizes(pairs) + # assert_backends(pairs) # perhaps its fine to just compare all `dest` backends + dest1 = first(pairs).first + backend = KA.get_backend(dest1) + kernel = fused_copyto_kernel!(backend) + kernel(fmb; ndrange = length(dest1)) end -Base.@propagate_inbounds function rcopyto_at!(pair::Pair, i...) +##### +##### rcopyto_at! +##### + +Base.@propagate_inbounds function rcopyto_at!(pair::Pair, I) dest, src = pair.first, pair.second - @inbounds dest[i...] = src[i...] + rcopyto_at!(dest, src, I) return nothing end -Base.@propagate_inbounds function rcopyto_at!(pairs::Tuple, i...) - rcopyto_at!(first(pairs), i...) - rcopyto_at!(Base.tail(pairs), i...) +# Base.@propagate_inbounds function rcopyto_at!(dest, @Const(src), I) # can't use @Const(src) here... +Base.@propagate_inbounds function rcopyto_at!(dest::AbstractVector, src, I) + @inbounds dest[I] = src[I] + return nothing end -Base.@propagate_inbounds rcopyto_at!(pairs::Tuple{<:Any}, i...) = - rcopyto_at!(first(pairs), i...) -@inline rcopyto_at!(pairs::Tuple{}, i...) 
= nothing +Base.@propagate_inbounds function rcopyto_at!(dest::AbstractArray, src, I) + @inbounds dest[I] = src[I] + return nothing +end +Base.@propagate_inbounds function rcopyto_at!(pairs::Tuple, I) + rcopyto_at!(first(pairs), I) + rcopyto_at!(Base.tail(pairs), I) +end +Base.@propagate_inbounds rcopyto_at!(pairs::Tuple{<:Any}, I) = + rcopyto_at!(first(pairs), I) +@inline rcopyto_at!(pairs::Tuple{}, I) = nothing -# This is better than the baseline. -function fused_copyto!(fmb::FusedMultiBroadcast, ::CPU) - (; pairs) = fmb - destinations = map(x -> x.first, pairs) - ei = if eltype(destinations) <: Vector - eachindex(destinations...) - else - eachindex(IndexCartesian(), destinations...) - end - for (dest, bc) in pairs - @inbounds @simd ivdep for i in ei - dest[i] = bc[i] - end - end -end - - -# This should, in theory be better, but it seems like inlining is -# failing somewhere. -# function fused_copyto!(fmb::FusedMultiBroadcast, ::CPU) -# (; pairs) = fmb -# destinations = map(x -> x.first, pairs) -# ei = if eltype(destinations) <: Vector -# eachindex(destinations...) -# else -# eachindex(IndexCartesian(), destinations...) -# end -# @inbounds @simd ivdep for i in ei -# MBF.rcopyto_at!(pairs, i) -# end -# end +##### +##### assert_sizes +##### + +Base.@propagate_inbounds function assert_sizes(pair::Pair) + dest, src = pair.first, pair.second + @assert size(dest) == size(src) + return nothing +end +Base.@propagate_inbounds function assert_sizes(pairs::Tuple) + assert_sizes(first(pairs)) + assert_sizes(Base.tail(pairs)) +end +Base.@propagate_inbounds assert_sizes(pairs::Tuple{<:Any}) = + assert_sizes(first(pairs)) +@inline assert_sizes(pairs::Tuple{}) = nothing + +##### +##### assert_backends +##### + +Base.@propagate_inbounds function assert_backends(pair::Pair) + dest, src = pair.first, pair.second + @assert KA.get_backend(dest) == KA.get_backend(src) + return nothing +end +Base.@propagate_inbounds function assert_backends(pairs::Tuple) + assert_backends(first(pairs)) + assert_backends(Base.tail(pairs)) +end +Base.@propagate_inbounds assert_backends(pairs::Tuple{<:Any}) = + assert_backends(first(pairs)) +@inline assert_backends(pairs::Tuple{}) = nothing diff --git a/test/execution/bm_fused_reads_vs_hard_coded.jl b/test/execution/bm_fused_reads_vs_hard_coded.jl index 76ce5fc..6b6d2ca 100644 --- a/test/execution/bm_fused_reads_vs_hard_coded.jl +++ b/test/execution/bm_fused_reads_vs_hard_coded.jl @@ -6,7 +6,7 @@ include("utils.jl") # =========================================== hard-coded implementations perf_kernel_hard_coded!(X, Y) = perf_kernel_hard_coded!(X, Y, MBF.device(X.x1)) -function perf_kernel_hard_coded!(X, Y, ::MBF.CPU) +function perf_kernel_hard_coded!(X, Y, ::MBF.MBF_CPU) (; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X (; y1, y2, y3, y4, y5, y6, y7, y8, y9, y10) = Y @inbounds for i in eachindex(x1) @@ -19,7 +19,7 @@ function perf_kernel_hard_coded!(X, Y, ::MBF.CPU) y7[i] = x7[i] + x8[i] + x9[i] + x10[i] end end -function perf_kernel_hard_coded!(X, Y, ::MBF.GPU) +function perf_kernel_hard_coded!(X, Y, ::MBF.MBF_GPU) x1 = X.x1 nitems = length(parent(x1)) max_threads = 256 # can be higher if conditions permit diff --git a/test/execution/utils.jl b/test/execution/utils.jl index 2f4ff72..232be2c 100644 --- a/test/execution/utils.jl +++ b/test/execution/utils.jl @@ -20,9 +20,9 @@ function benchmark_kernel!(f!, args...) trial = benchmark_kernel!(MBF.device(X.x1), f!, args...) show(stdout, MIME("text/plain"), trial) end -benchmark_kernel!(::MBF.GPU, f!, args...) 
= +benchmark_kernel!(::MBF.MBF_GPU, f!, args...) = BenchmarkTools.@benchmark CUDA.@sync $f!($args...); -benchmark_kernel!(::MBF.CPU, f!, args...) = +benchmark_kernel!(::MBF.MBF_CPU, f!, args...) = BenchmarkTools.@benchmark $f!($args...); function show_diff(A, B)
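
For reference, a minimal sketch of how the new KernelAbstractions path in src/execution/fused_kernels.jl is exercised. It builds the FusedMultiBroadcast tuple of dest => Broadcasted pairs by hand (normally the @fused_direct / @fused_assemble macros generated by @make_fused produce this object); the array sizes, the + / * operations, and running on the CPU backend are illustrative assumptions, not part of the diff.

import MultiBroadcastFusion as MBF

# Two inputs and two destinations. All sizes must match, since the new
# Base.copyto!(::FusedMultiBroadcast) calls assert_sizes(pairs) before launching.
x1 = rand(10^4); x2 = rand(10^4)
y1 = similar(x1); y2 = similar(x2)

# A FusedMultiBroadcast wraps a tuple of dest => lazy Broadcasted pairs,
# mirroring what the fusion macros construct.
fmb = MBF.FusedMultiBroadcast((
    Pair(y1, Base.Broadcast.broadcasted(+, x1, x2)),
    Pair(y2, Base.Broadcast.broadcasted(*, x1, x2)),
))

# copyto! picks the backend from the first destination via KA.get_backend,
# instantiates fused_copyto_kernel!(backend), and runs it with
# ndrange = length(y1), so all destinations are filled in one kernel launch.
copyto!(fmb)

@assert y1 == x1 .+ x2
@assert y2 == x1 .* x2

On a CUDA device the same call path applies: KA.get_backend on a CuArray returns the CUDA backend, so no hand-written @cuda launch (as in the now commented-out extension) is needed.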