diff --git a/Project.toml b/Project.toml index ee8d9a1..e5b2a83 100644 --- a/Project.toml +++ b/Project.toml @@ -3,25 +3,28 @@ uuid = "c3c07f87-98de-43f2-a76f-835b330b2cbb" authors = ["CliMA Contributors "] version = "0.3.1" +[deps] +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" + [weakdeps] -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" [extensions] MultiBroadcastFusionCUDAExt = ["CUDA", "Adapt"] [compat] -julia = "^1.9" Adapt = "3, 4" CUDA = "5" +julia = "^1.9" [extras] -Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" +SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] test = ["Test", "CUDA", "Adapt", "SafeTestsets", "BenchmarkTools", "InteractiveUtils"] diff --git a/ext/MultiBroadcastFusionCUDAExt.jl b/ext/MultiBroadcastFusionCUDAExt.jl index 3c6af49..9f7ff77 100644 --- a/ext/MultiBroadcastFusionCUDAExt.jl +++ b/ext/MultiBroadcastFusionCUDAExt.jl @@ -1,55 +1,55 @@ module MultiBroadcastFusionCUDAExt -import CUDA, Adapt -import MultiBroadcastFusion as MBF -import MultiBroadcastFusion: fused_copyto! - -MBF.device(x::CUDA.CuArray) = MBF.GPU() - -function fused_copyto!(fmb::MBF.FusedMultiBroadcast, ::MBF.GPU) - (; pairs) = fmb - dest = first(pairs).first - nitems = length(parent(dest)) - max_threads = 256 # can be higher if conditions permit - nthreads = min(max_threads, nitems) - nblocks = cld(nitems, nthreads) - CUDA.@cuda threads = (nthreads) blocks = (nblocks) fused_copyto_kernel!(fmb) - return nothing -end -function fused_copyto_kernel!(fmb::MBF.FusedMultiBroadcast) - (; pairs) = fmb - dest = first(pairs).first - nitems = length(dest) - idx = CUDA.threadIdx().x + (CUDA.blockIdx().x - 1) * CUDA.blockDim().x - if idx ≤ nitems - MBF.rcopyto_at!(pairs, idx) - end - return nothing -end - -adapt_f(to, f::F) where {F} = Adapt.adapt(to, f) -adapt_f(to, ::Type{F}) where {F} = (x...) -> F(x...) - -adapt_src(to, src::AbstractArray) = Adapt.adapt(to, src) - -function adapt_src(to, bc::Base.Broadcast.Broadcasted) - Base.Broadcast.Broadcasted( - bc.style, - adapt_f(to, bc.f), - Adapt.adapt(to, bc.args), - Adapt.adapt(to, bc.axes), - ) -end - -function Adapt.adapt_structure( - to::CUDA.KernelAdaptor, - fmbc::MBF.FusedMultiBroadcast, -) - MBF.FusedMultiBroadcast(map(fmbc.pairs) do pair - dest = pair.first - src = pair.second - Pair(Adapt.adapt(to, dest), adapt_src(to, src)) - end) -end +# import CUDA, Adapt +# import MultiBroadcastFusion as MBF +# import MultiBroadcastFusion: fused_copyto! 
+ +# # MBF.device(x::CUDA.CuArray) = MBF.GPU() + +# function fused_copyto!(fmb::MBF.FusedMultiBroadcast, ::MBF.GPU) +# (; pairs) = fmb +# dest = first(pairs).first +# nitems = length(parent(dest)) +# max_threads = 256 # can be higher if conditions permit +# nthreads = min(max_threads, nitems) +# nblocks = cld(nitems, nthreads) +# CUDA.@cuda threads = (nthreads) blocks = (nblocks) fused_copyto_kernel!(fmb) +# return nothing +# end +# function fused_copyto_kernel!(fmb::MBF.FusedMultiBroadcast) +# (; pairs) = fmb +# dest = first(pairs).first +# nitems = length(dest) +# idx = CUDA.threadIdx().x + (CUDA.blockIdx().x - 1) * CUDA.blockDim().x +# if idx ≤ nitems +# MBF.rcopyto_at!(pairs, idx) +# end +# return nothing +# end + +# adapt_f(to, f::F) where {F} = Adapt.adapt(to, f) +# adapt_f(to, ::Type{F}) where {F} = (x...) -> F(x...) + +# adapt_src(to, src::AbstractArray) = Adapt.adapt(to, src) + +# function adapt_src(to, bc::Base.Broadcast.Broadcasted) +# Base.Broadcast.Broadcasted( +# bc.style, +# adapt_f(to, bc.f), +# Adapt.adapt(to, bc.args), +# Adapt.adapt(to, bc.axes), +# ) +# end + +# function Adapt.adapt_structure( +# to::CUDA.KernelAdaptor, +# fmbc::MBF.FusedMultiBroadcast, +# ) +# MBF.FusedMultiBroadcast(map(fmbc.pairs) do pair +# dest = pair.first +# src = pair.second +# Pair(Adapt.adapt(to, dest), adapt_src(to, src)) +# end) +# end end diff --git a/src/execution/fused_kernels.jl b/src/execution/fused_kernels.jl index f3bd891..9e0dc44 100644 --- a/src/execution/fused_kernels.jl +++ b/src/execution/fused_kernels.jl @@ -2,57 +2,86 @@ @make_fused fused_direct FusedMultiBroadcast fused_direct @make_fused fused_assemble FusedMultiBroadcast fused_assemble -struct CPU end -struct GPU end -device(x::AbstractArray) = CPU() +import KernelAbstractions as KA +using KernelAbstractions + +# For tests, we can move this out. +struct MBF_CPU end +struct MBF_GPU end +device(x::AbstractArray) = MBF_CPU() + +KA.@kernel function fused_copyto_kernel!(fmb::FusedMultiBroadcast) + (; pairs) = fmb + I = @index(Global, Cartesian) + rcopyto_at!(pairs, I) +end function Base.copyto!(fmb::FusedMultiBroadcast) - pairs = fmb.pairs # (Pair(dest1, bc1),Pair(dest2, bc2),...) - dest = first(pairs).first - fused_copyto!(fmb, device(dest)) + (; pairs) = fmb # (Pair(dest1, bc1), Pair(dest2, bc2),...) + assert_sizes(pairs) + # assert_backends(pairs) # perhaps its fine to just compare all `dest` backends + dest1 = first(pairs).first + backend = KA.get_backend(dest1) + kernel = fused_copyto_kernel!(backend) + kernel(fmb; ndrange = length(dest1)) end -Base.@propagate_inbounds function rcopyto_at!(pair::Pair, i...) +##### +##### rcopyto_at! +##### + +Base.@propagate_inbounds function rcopyto_at!(pair::Pair, I) dest, src = pair.first, pair.second - @inbounds dest[i...] = src[i...] + rcopyto_at!(dest, src, I) return nothing end -Base.@propagate_inbounds function rcopyto_at!(pairs::Tuple, i...) - rcopyto_at!(first(pairs), i...) - rcopyto_at!(Base.tail(pairs), i...) +# Base.@propagate_inbounds function rcopyto_at!(dest, @Const(src), I) # can't use @Const(src) here... +Base.@propagate_inbounds function rcopyto_at!(dest::AbstractVector, src, I) + @inbounds dest[I] = src[I] + return nothing end -Base.@propagate_inbounds rcopyto_at!(pairs::Tuple{<:Any}, i...) = - rcopyto_at!(first(pairs), i...) -@inline rcopyto_at!(pairs::Tuple{}, i...) 
= nothing +Base.@propagate_inbounds function rcopyto_at!(dest::AbstractArray, src, I) + @inbounds dest[I] = src[I] + return nothing +end +Base.@propagate_inbounds function rcopyto_at!(pairs::Tuple, I) + rcopyto_at!(first(pairs), I) + rcopyto_at!(Base.tail(pairs), I) +end +Base.@propagate_inbounds rcopyto_at!(pairs::Tuple{<:Any}, I) = + rcopyto_at!(first(pairs), I) +@inline rcopyto_at!(pairs::Tuple{}, I) = nothing -# This is better than the baseline. -function fused_copyto!(fmb::FusedMultiBroadcast, ::CPU) - (; pairs) = fmb - destinations = map(x -> x.first, pairs) - ei = if eltype(destinations) <: Vector - eachindex(destinations...) - else - eachindex(IndexCartesian(), destinations...) - end - for (dest, bc) in pairs - @inbounds @simd ivdep for i in ei - dest[i] = bc[i] - end - end -end - - -# This should, in theory be better, but it seems like inlining is -# failing somewhere. -# function fused_copyto!(fmb::FusedMultiBroadcast, ::CPU) -# (; pairs) = fmb -# destinations = map(x -> x.first, pairs) -# ei = if eltype(destinations) <: Vector -# eachindex(destinations...) -# else -# eachindex(IndexCartesian(), destinations...) -# end -# @inbounds @simd ivdep for i in ei -# MBF.rcopyto_at!(pairs, i) -# end -# end +##### +##### assert_sizes +##### + +Base.@propagate_inbounds function assert_sizes(pair::Pair) + dest, src = pair.first, pair.second + @assert size(dest) == size(src) + return nothing +end +Base.@propagate_inbounds function assert_sizes(pairs::Tuple) + assert_sizes(first(pairs)) + assert_sizes(Base.tail(pairs)) +end +Base.@propagate_inbounds assert_sizes(pairs::Tuple{<:Any}) = + assert_sizes(first(pairs)) +@inline assert_sizes(pairs::Tuple{}) = nothing + +##### +##### assert_backends +##### + +Base.@propagate_inbounds function assert_backends(pair::Pair) + dest, src = pair.first, pair.second + @assert KA.get_backend(dest) == KA.get_backend(src) + return nothing +end +Base.@propagate_inbounds function assert_backends(pairs::Tuple) + assert_backends(first(pairs)) + assert_backends(Base.tail(pairs)) +end +Base.@propagate_inbounds assert_backends(pairs::Tuple{<:Any}) = + assert_backends(first(pairs)) +@inline assert_backends(pairs::Tuple{}) = nothing diff --git a/test/execution/bm_fused_reads_vs_hard_coded.jl b/test/execution/bm_fused_reads_vs_hard_coded.jl index 76ce5fc..6b6d2ca 100644 --- a/test/execution/bm_fused_reads_vs_hard_coded.jl +++ b/test/execution/bm_fused_reads_vs_hard_coded.jl @@ -6,7 +6,7 @@ include("utils.jl") # =========================================== hard-coded implementations perf_kernel_hard_coded!(X, Y) = perf_kernel_hard_coded!(X, Y, MBF.device(X.x1)) -function perf_kernel_hard_coded!(X, Y, ::MBF.CPU) +function perf_kernel_hard_coded!(X, Y, ::MBF.MBF_CPU) (; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X (; y1, y2, y3, y4, y5, y6, y7, y8, y9, y10) = Y @inbounds for i in eachindex(x1) @@ -19,7 +19,7 @@ function perf_kernel_hard_coded!(X, Y, ::MBF.CPU) y7[i] = x7[i] + x8[i] + x9[i] + x10[i] end end -function perf_kernel_hard_coded!(X, Y, ::MBF.GPU) +function perf_kernel_hard_coded!(X, Y, ::MBF.MBF_GPU) x1 = X.x1 nitems = length(parent(x1)) max_threads = 256 # can be higher if conditions permit diff --git a/test/execution/utils.jl b/test/execution/utils.jl index 2f4ff72..232be2c 100644 --- a/test/execution/utils.jl +++ b/test/execution/utils.jl @@ -20,9 +20,9 @@ function benchmark_kernel!(f!, args...) trial = benchmark_kernel!(MBF.device(X.x1), f!, args...) show(stdout, MIME("text/plain"), trial) end -benchmark_kernel!(::MBF.GPU, f!, args...) 
= +benchmark_kernel!(::MBF.MBF_GPU, f!, args...) = BenchmarkTools.@benchmark CUDA.@sync $f!($args...); -benchmark_kernel!(::MBF.CPU, f!, args...) = +benchmark_kernel!(::MBF.MBF_CPU, f!, args...) = BenchmarkTools.@benchmark $f!($args...); function show_diff(A, B)
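
For reference, a minimal sketch of how the new KernelAbstractions path in src/execution/fused_kernels.jl is exercised. It builds the FusedMultiBroadcast tuple of dest => Broadcasted pairs by hand (normally the @fused_direct / @fused_assemble macros generated by @make_fused produce this object); the array sizes, the + / * operations, and running on the CPU backend are illustrative assumptions, not part of the diff.

import MultiBroadcastFusion as MBF

# Two inputs and two destinations. All sizes must match, since the new
# Base.copyto!(::FusedMultiBroadcast) calls assert_sizes(pairs) before launching.
x1 = rand(10^4); x2 = rand(10^4)
y1 = similar(x1); y2 = similar(x2)

# A FusedMultiBroadcast wraps a tuple of dest => lazy Broadcasted pairs,
# mirroring what the fusion macros construct.
fmb = MBF.FusedMultiBroadcast((
    Pair(y1, Base.Broadcast.broadcasted(+, x1, x2)),
    Pair(y2, Base.Broadcast.broadcasted(*, x1, x2)),
))

# copyto! picks the backend from the first destination via KA.get_backend,
# instantiates fused_copyto_kernel!(backend), and runs it with
# ndrange = length(y1), so all destinations are filled in one kernel launch.
copyto!(fmb)

@assert y1 == x1 .+ x2
@assert y2 == x1 .* x2

On a CUDA device the same call path applies: KA.get_backend on a CuArray returns the CUDA backend, so no hand-written @cuda launch (as in the now commented-out extension) is needed.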