diff --git a/Project.toml b/Project.toml index 966914c..a3ddf36 100644 --- a/Project.toml +++ b/Project.toml @@ -3,26 +3,29 @@ uuid = "c3c07f87-98de-43f2-a76f-835b330b2cbb" authors = ["CliMA Contributors "] version = "0.3.2" +[deps] +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" + [weakdeps] -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" [extensions] MultiBroadcastFusionCUDAExt = ["CUDA", "Adapt"] [compat] -julia = "^1.9" Adapt = "3, 4" CUDA = "5" +julia = "^1.9" [extras] -Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" [targets] diff --git a/ext/MultiBroadcastFusionCUDAExt.jl b/ext/MultiBroadcastFusionCUDAExt.jl index 225746f..8c3a94b 100644 --- a/ext/MultiBroadcastFusionCUDAExt.jl +++ b/ext/MultiBroadcastFusionCUDAExt.jl @@ -2,45 +2,9 @@ module MultiBroadcastFusionCUDAExt import CUDA, Adapt import MultiBroadcastFusion as MBF -import MultiBroadcastFusion: fused_copyto! MBF.device(x::CUDA.CuArray) = MBF.MBF_CUDA() -function fused_copyto!(fmb::MBF.FusedMultiBroadcast, ::MBF.MBF_CUDA) - (; pairs) = fmb - dest = first(pairs).first - destinations = map(p -> p.first, pairs) - all(a -> axes(a) == axes(dest), destinations) || - error("Cannot fuse broadcast expressions with unequal broadcast axes") - nitems = length(parent(dest)) - CI = CartesianIndices(axes(dest)) - kernel = - CUDA.@cuda always_inline = true launch = false fused_copyto_kernel!( - fmb, - CI, - ) - config = CUDA.launch_configuration(kernel.fun) - threads = min(nitems, config.threads) - blocks = cld(nitems, threads) - kernel(fmb, CI; threads, blocks) - return destinations -end -import Base.Broadcast -function fused_copyto_kernel!(fmb::MBF.FusedMultiBroadcast, CI) - @inbounds begin - (; pairs) = fmb - dest = first(pairs).first - nitems = length(dest) - idx = - CUDA.threadIdx().x + - (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x - if 1 ≤ idx ≤ nitems - MBF.rcopyto_at!(pairs, CI[idx]) - end - end - return nothing -end - adapt_f(to, f::F) where {F} = Adapt.adapt(to, f) adapt_f(to, ::Type{F}) where {F} = (x...) -> F(x...) diff --git a/src/execution/fused_kernels.jl b/src/execution/fused_kernels.jl index 67608c6..025dac7 100644 --- a/src/execution/fused_kernels.jl +++ b/src/execution/fused_kernels.jl @@ -2,10 +2,20 @@ @make_fused fused_direct FusedMultiBroadcast fused_direct @make_fused fused_assemble FusedMultiBroadcast fused_assemble +import KernelAbstractions as KA +using KernelAbstractions + +# For tests, we can move this out. struct MBF_CPU end struct MBF_CUDA end device(x::AbstractArray) = MBF_CPU() +KA.@kernel function fused_copyto_kernel!(fmb::FusedMultiBroadcast) + (; pairs) = fmb + I = @index(Global, Cartesian) + rcopyto_at!(pairs, I) +end + function Base.copyto!(fmb::FusedMultiBroadcast) # Since we intercept Base.copyto!, we have not yet # called Base.Broadcast.instantiate (as this is done @@ -18,59 +28,71 @@ function Base.copyto!(fmb::FusedMultiBroadcast) ) (; pairs) = fmb′ # (Pair(dest1, bc1),Pair(dest2, bc2),...) dest = first(pairs).first - fused_copyto!(fmb′, device(dest)) + + assert_sizes(pairs) + # assert_backends(pairs) # perhaps its fine to just compare all `dest` backends? + dest1 = first(pairs).first + backend = KA.get_backend(dest1) + kernel = fused_copyto_kernel!(backend) + kernel(fmb′; ndrange = size(dest1)) end -Base.@propagate_inbounds function rcopyto_at!( - pair::Pair, - i::Vararg{T}, -) where {T} +##### +##### rcopyto_at! +##### + +Base.@propagate_inbounds function rcopyto_at!(pair::Pair, I) dest, src = pair.first, pair.second - @inbounds dest[i...] = src[i...] + rcopyto_at!(dest, src, I) + return nothing +end +# Base.@propagate_inbounds function rcopyto_at!(dest, @Const(src), I) # can't use @Const(src) here... +Base.@propagate_inbounds function rcopyto_at!(dest::AbstractVector, src, I) + @inbounds dest[I] = src[I] + return nothing +end +Base.@propagate_inbounds function rcopyto_at!(dest::AbstractArray, src, I) + @inbounds dest[I] = src[I] return nothing end -Base.@propagate_inbounds function rcopyto_at!( - pairs::Tuple, - i::Vararg{T}, -) where {T} - rcopyto_at!(first(pairs), i...) - rcopyto_at!(Base.tail(pairs), i...) +Base.@propagate_inbounds function rcopyto_at!(pairs::Tuple, I) + rcopyto_at!(first(pairs), I) + rcopyto_at!(Base.tail(pairs), I) end -Base.@propagate_inbounds rcopyto_at!( - pairs::Tuple{<:Any}, - i::Vararg{T}, -) where {T} = rcopyto_at!(first(pairs), i...) -@inline rcopyto_at!(pairs::Tuple{}, i::Vararg{T}) where {T} = nothing +Base.@propagate_inbounds rcopyto_at!(pairs::Tuple{<:Any}, I) = + rcopyto_at!(first(pairs), I) +@inline rcopyto_at!(pairs::Tuple{}, I) = nothing -# This is better than the baseline. -function fused_copyto!(fmb::FusedMultiBroadcast, ::MBF_CPU) - (; pairs) = fmb - destinations = map(x -> x.first, pairs) - ei = if eltype(destinations) <: Vector - eachindex(destinations...) - else - eachindex(IndexCartesian(), destinations...) - end - for (dest, bc) in pairs - @inbounds @simd ivdep for i in ei - dest[i] = bc[i] - end - end - return destinations +##### +##### assert_sizes +##### + +Base.@propagate_inbounds function assert_sizes(pair::Pair) + dest, src = pair.first, pair.second + @assert size(dest) == size(src) + return nothing +end +Base.@propagate_inbounds function assert_sizes(pairs::Tuple) + assert_sizes(first(pairs)) + assert_sizes(Base.tail(pairs)) end +Base.@propagate_inbounds assert_sizes(pairs::Tuple{<:Any}) = + assert_sizes(first(pairs)) +@inline assert_sizes(pairs::Tuple{}) = nothing +##### +##### assert_backends +##### -# This should, in theory be better, but it seems like inlining is -# failing somewhere. -# function fused_copyto!(fmb::FusedMultiBroadcast, ::MBF_CPU) -# (; pairs) = fmb -# destinations = map(x -> x.first, pairs) -# ei = if eltype(destinations) <: Vector -# eachindex(destinations...) -# else -# eachindex(IndexCartesian(), destinations...) -# end -# @inbounds @simd ivdep for i in ei -# MBF.rcopyto_at!(pairs, i) -# end -# end +Base.@propagate_inbounds function assert_backends(pair::Pair) + dest, src = pair.first, pair.second + @assert KA.get_backend(dest) == KA.get_backend(src) + return nothing +end +Base.@propagate_inbounds function assert_backends(pairs::Tuple) + assert_backends(first(pairs)) + assert_backends(Base.tail(pairs)) +end +Base.@propagate_inbounds assert_backends(pairs::Tuple{<:Any}) = + assert_backends(first(pairs)) +@inline assert_backends(pairs::Tuple{}) = nothing diff --git a/test/execution/runtests.jl b/test/execution/runtests.jl index 6c5d4c0..7fb3763 100644 --- a/test/execution/runtests.jl +++ b/test/execution/runtests.jl @@ -1,6 +1,8 @@ #= using Revise; include(joinpath("test", "execution", "runtests.jl")) =# +using Test +using SafeTestsets #! format: off @safetestset "fused_shared_reads" begin; @time include("bm_fused_shared_reads.jl"); end diff --git a/test/runtests.jl b/test/runtests.jl index 500464d..7d480f5 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,7 +1,6 @@ #= julia --project -using TestEnv -TestEnv.activate() +using TestEnv; TestEnv.activate() using CUDA; ENV["PERFORM_BENCHMARK"]="true";