Use KernelAbstractions backend
charleskawczynski committed Jul 19, 2024
1 parent 50e15ff commit d6cc624
Showing 5 changed files with 137 additions and 105 deletions.
13 changes: 8 additions & 5 deletions Project.toml
@@ -3,25 +3,28 @@ uuid = "c3c07f87-98de-43f2-a76f-835b330b2cbb"
authors = ["CliMA Contributors <[email protected]>"]
version = "0.3.1"

[deps]
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"

[extensions]
MultiBroadcastFusionCUDAExt = ["CUDA", "Adapt"]

[compat]
julia = "^1.9"
Adapt = "3, 4"
CUDA = "5"
julia = "^1.9"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test", "CUDA", "Adapt", "SafeTestsets", "BenchmarkTools", "InteractiveUtils"]
102 changes: 51 additions & 51 deletions ext/MultiBroadcastFusionCUDAExt.jl
@@ -1,55 +1,55 @@
module MultiBroadcastFusionCUDAExt

import CUDA, Adapt
import MultiBroadcastFusion as MBF
import MultiBroadcastFusion: fused_copyto!

MBF.device(x::CUDA.CuArray) = MBF.GPU()

function fused_copyto!(fmb::MBF.FusedMultiBroadcast, ::MBF.GPU)
(; pairs) = fmb
dest = first(pairs).first
nitems = length(parent(dest))
max_threads = 256 # can be higher if conditions permit
nthreads = min(max_threads, nitems)
nblocks = cld(nitems, nthreads)
CUDA.@cuda threads = (nthreads) blocks = (nblocks) fused_copyto_kernel!(fmb)
return nothing
end
function fused_copyto_kernel!(fmb::MBF.FusedMultiBroadcast)
(; pairs) = fmb
dest = first(pairs).first
nitems = length(dest)
idx = CUDA.threadIdx().x + (CUDA.blockIdx().x - 1) * CUDA.blockDim().x
if idx ≤ nitems
MBF.rcopyto_at!(pairs, idx)
end
return nothing
end
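
For readers unfamiliar with the launch configuration above: `nthreads` is capped at 256 threads per block, `nblocks = cld(nitems, nthreads)` rounds up so every element is covered, and each thread recovers its global linear index from its thread/block coordinates. A self-contained sketch of the same pattern (the saxpy kernel and sizes are illustrative, not part of this package):

```julia
using CUDA

function saxpy_kernel!(y, x, a)
    # Global linear index: thread id within the block, plus the block offset.
    idx = CUDA.threadIdx().x + (CUDA.blockIdx().x - 1) * CUDA.blockDim().x
    if idx <= length(y)           # guard: the last block may be partially full
        @inbounds y[idx] += a * x[idx]
    end
    return nothing
end

y = CUDA.zeros(Float32, 10_000)
x = CUDA.ones(Float32, 10_000)
nitems = length(y)
nthreads = min(256, nitems)       # same 256-thread cap as fused_copyto! above
nblocks = cld(nitems, nthreads)   # enough blocks to cover all nitems
CUDA.@cuda threads = nthreads blocks = nblocks saxpy_kernel!(y, x, 2.0f0)
```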

adapt_f(to, f::F) where {F} = Adapt.adapt(to, f)
adapt_f(to, ::Type{F}) where {F} = (x...) -> F(x...)

adapt_src(to, src::AbstractArray) = Adapt.adapt(to, src)

function adapt_src(to, bc::Base.Broadcast.Broadcasted)
Base.Broadcast.Broadcasted(
bc.style,
adapt_f(to, bc.f),
Adapt.adapt(to, bc.args),
Adapt.adapt(to, bc.axes),
)
end

function Adapt.adapt_structure(
to::CUDA.KernelAdaptor,
fmbc::MBF.FusedMultiBroadcast,
)
MBF.FusedMultiBroadcast(map(fmbc.pairs) do pair
dest = pair.first
src = pair.second
Pair(Adapt.adapt(to, dest), adapt_src(to, src))
end)
end
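
The adapt_f/adapt_src/adapt_structure methods above exist because a FusedMultiBroadcast is a nested struct that must be rebuilt with device-side arrays before it can cross the kernel boundary. A hedged sketch of the same Adapt pattern on a hypothetical wrapper type (not part of this package):

```julia
using Adapt

struct PairOfArrays{A, B}
    a::A
    b::B
end

# Rebuild the wrapper with each field adapted, mirroring what
# Adapt.adapt_structure does for FusedMultiBroadcast above.
Adapt.adapt_structure(to, p::PairOfArrays) =
    PairOfArrays(Adapt.adapt(to, p.a), Adapt.adapt(to, p.b))

# Equivalently, Adapt can generate this method: Adapt.@adapt_structure PairOfArrays
```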
# import CUDA, Adapt
# import MultiBroadcastFusion as MBF
# import MultiBroadcastFusion: fused_copyto!

# # MBF.device(x::CUDA.CuArray) = MBF.GPU()

# function fused_copyto!(fmb::MBF.FusedMultiBroadcast, ::MBF.GPU)
# (; pairs) = fmb
# dest = first(pairs).first
# nitems = length(parent(dest))
# max_threads = 256 # can be higher if conditions permit
# nthreads = min(max_threads, nitems)
# nblocks = cld(nitems, nthreads)
# CUDA.@cuda threads = (nthreads) blocks = (nblocks) fused_copyto_kernel!(fmb)
# return nothing
# end
# function fused_copyto_kernel!(fmb::MBF.FusedMultiBroadcast)
# (; pairs) = fmb
# dest = first(pairs).first
# nitems = length(dest)
# idx = CUDA.threadIdx().x + (CUDA.blockIdx().x - 1) * CUDA.blockDim().x
# if idx ≤ nitems
# MBF.rcopyto_at!(pairs, idx)
# end
# return nothing
# end

# adapt_f(to, f::F) where {F} = Adapt.adapt(to, f)
# adapt_f(to, ::Type{F}) where {F} = (x...) -> F(x...)

# adapt_src(to, src::AbstractArray) = Adapt.adapt(to, src)

# function adapt_src(to, bc::Base.Broadcast.Broadcasted)
# Base.Broadcast.Broadcasted(
# bc.style,
# adapt_f(to, bc.f),
# Adapt.adapt(to, bc.args),
# Adapt.adapt(to, bc.axes),
# )
# end

# function Adapt.adapt_structure(
# to::CUDA.KernelAdaptor,
# fmbc::MBF.FusedMultiBroadcast,
# )
# MBF.FusedMultiBroadcast(map(fmbc.pairs) do pair
# dest = pair.first
# src = pair.second
# Pair(Adapt.adapt(to, dest), adapt_src(to, src))
# end)
# end

end
119 changes: 74 additions & 45 deletions src/execution/fused_kernels.jl
@@ -2,57 +2,86 @@
@make_fused fused_direct FusedMultiBroadcast fused_direct
@make_fused fused_assemble FusedMultiBroadcast fused_assemble

struct CPU end
struct GPU end
device(x::AbstractArray) = CPU()
import KernelAbstractions as KA
using KernelAbstractions

# Used for tests; we can move this out later.
struct MBF_CPU end
struct MBF_GPU end
device(x::AbstractArray) = MBF_CPU()

KA.@kernel function fused_copyto_kernel!(fmb::FusedMultiBroadcast)
(; pairs) = fmb
I = @index(Global, Cartesian)
rcopyto_at!(pairs, I)
end

function Base.copyto!(fmb::FusedMultiBroadcast)
pairs = fmb.pairs # (Pair(dest1, bc1),Pair(dest2, bc2),...)
dest = first(pairs).first
fused_copyto!(fmb, device(dest))
(; pairs) = fmb # (Pair(dest1, bc1), Pair(dest2, bc2),...)
assert_sizes(pairs)
# assert_backends(pairs) # perhaps it's fine to just compare all `dest` backends
dest1 = first(pairs).first
backend = KA.get_backend(dest1)
kernel = fused_copyto_kernel!(backend)
kernel(fmb; ndrange = length(dest1))
end
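
For context, this is the standard KernelAbstractions workflow: `@kernel` defines a backend-agnostic kernel, instantiating it against a backend returns a callable, and `ndrange` plays the role of the explicit threads/blocks computation in the old CUDA path. A minimal standalone sketch (kernel and arrays are illustrative):

```julia
import KernelAbstractions as KA
using KernelAbstractions

KA.@kernel function add_kernel!(c, a, b)
    I = @index(Global, Cartesian)   # same indexing style as fused_copyto_kernel!
    @inbounds c[I] = a[I] + b[I]
end

a, b = rand(32, 32), rand(32, 32)
c = similar(a)
backend = KA.get_backend(c)         # CPU() here; CUDABackend() for CuArrays
kernel = add_kernel!(backend)       # instantiate the kernel for that backend
kernel(c, a, b; ndrange = size(c))  # one work-item per destination element
KA.synchronize(backend)
```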

Base.@propagate_inbounds function rcopyto_at!(pair::Pair, i...)
#####
##### rcopyto_at!
#####

Base.@propagate_inbounds function rcopyto_at!(pair::Pair, I)
dest, src = pair.first, pair.second
@inbounds dest[i...] = src[i...]
rcopyto_at!(dest, src, I)
return nothing
end
Base.@propagate_inbounds function rcopyto_at!(pairs::Tuple, i...)
rcopyto_at!(first(pairs), i...)
rcopyto_at!(Base.tail(pairs), i...)
# Base.@propagate_inbounds function rcopyto_at!(dest, @Const(src), I) # can't use @Const(src) here...
Base.@propagate_inbounds function rcopyto_at!(dest::AbstractVector, src, I)
@inbounds dest[I] = src[I]
return nothing
end
Base.@propagate_inbounds rcopyto_at!(pairs::Tuple{<:Any}, i...) =
rcopyto_at!(first(pairs), i...)
@inline rcopyto_at!(pairs::Tuple{}, i...) = nothing
Base.@propagate_inbounds function rcopyto_at!(dest::AbstractArray, src, I)
@inbounds dest[I] = src[I]
return nothing
end
Base.@propagate_inbounds function rcopyto_at!(pairs::Tuple, I)
rcopyto_at!(first(pairs), I)
rcopyto_at!(Base.tail(pairs), I)
end
Base.@propagate_inbounds rcopyto_at!(pairs::Tuple{<:Any}, I) =
rcopyto_at!(first(pairs), I)
@inline rcopyto_at!(pairs::Tuple{}, I) = nothing
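
The three Tuple methods above are a standard compile-time unrolling idiom: each call handles `first(pairs)` and recurses on `Base.tail(pairs)`, with the one-element and empty-tuple methods terminating the recursion, so a heterogeneous tuple of pairs is processed without dynamic dispatch. The same pattern in isolation (a hypothetical helper, not part of this package):

```julia
# Apply f to each element of a (possibly heterogeneous) tuple; the recursion
# unrolls at compile time because the tuple length is known to the compiler.
@inline foreach_unrolled(f, t::Tuple) =
    (f(first(t)); foreach_unrolled(f, Base.tail(t)))
@inline foreach_unrolled(f, t::Tuple{<:Any}) = f(first(t))
@inline foreach_unrolled(f, ::Tuple{}) = nothing

foreach_unrolled(println, (1, "two", 3.0))
```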

# This is better than the baseline.
function fused_copyto!(fmb::FusedMultiBroadcast, ::CPU)
(; pairs) = fmb
destinations = map(x -> x.first, pairs)
ei = if eltype(destinations) <: Vector
eachindex(destinations...)
else
eachindex(IndexCartesian(), destinations...)
end
for (dest, bc) in pairs
@inbounds @simd ivdep for i in ei
dest[i] = bc[i]
end
end
end


# This should, in theory, be better, but it seems like inlining is
# failing somewhere.
# function fused_copyto!(fmb::FusedMultiBroadcast, ::CPU)
# (; pairs) = fmb
# destinations = map(x -> x.first, pairs)
# ei = if eltype(destinations) <: Vector
# eachindex(destinations...)
# else
# eachindex(IndexCartesian(), destinations...)
# end
# @inbounds @simd ivdep for i in ei
# MBF.rcopyto_at!(pairs, i)
# end
# end
#####
##### assert_sizes
#####

Base.@propagate_inbounds function assert_sizes(pair::Pair)
dest, src = pair.first, pair.second
@assert size(dest) == size(src)
return nothing
end
Base.@propagate_inbounds function assert_sizes(pairs::Tuple)
assert_sizes(first(pairs))
assert_sizes(Base.tail(pairs))
end
Base.@propagate_inbounds assert_sizes(pairs::Tuple{<:Any}) =
assert_sizes(first(pairs))
@inline assert_sizes(pairs::Tuple{}) = nothing

#####
##### assert_backends
#####

Base.@propagate_inbounds function assert_backends(pair::Pair)
dest, src = pair.first, pair.second
@assert KA.get_backend(dest) == KA.get_backend(src)
return nothing
end
Base.@propagate_inbounds function assert_backends(pairs::Tuple)
assert_backends(first(pairs))
assert_backends(Base.tail(pairs))
end
Base.@propagate_inbounds assert_backends(pairs::Tuple{<:Any}) =
assert_backends(first(pairs))
@inline assert_backends(pairs::Tuple{}) = nothing
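
Both checks walk the pairs tuple with the same unrolled recursion. A quick illustration of what `assert_sizes` catches, using plain arrays in place of `Broadcasted` sources (values are illustrative):

```julia
dest1, src1 = zeros(4), ones(4)
dest2, src2 = zeros(3), ones(4)   # destination/source sizes disagree

assert_sizes((dest1 => src1,))    # passes silently
# assert_sizes((dest1 => src1, dest2 => src2))  # would throw an AssertionError
```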
4 changes: 2 additions & 2 deletions test/execution/bm_fused_reads_vs_hard_coded.jl
@@ -6,7 +6,7 @@ include("utils.jl")
# =========================================== hard-coded implementations
perf_kernel_hard_coded!(X, Y) = perf_kernel_hard_coded!(X, Y, MBF.device(X.x1))

function perf_kernel_hard_coded!(X, Y, ::MBF.CPU)
function perf_kernel_hard_coded!(X, Y, ::MBF.MBF_CPU)
(; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X
(; y1, y2, y3, y4, y5, y6, y7, y8, y9, y10) = Y
@inbounds for i in eachindex(x1)
@@ -19,7 +19,7 @@ function perf_kernel_hard_coded!(X, Y, ::MBF.CPU)
y7[i] = x7[i] + x8[i] + x9[i] + x10[i]
end
end
function perf_kernel_hard_coded!(X, Y, ::MBF.GPU)
function perf_kernel_hard_coded!(X, Y, ::MBF.MBF_GPU)
x1 = X.x1
nitems = length(parent(x1))
max_threads = 256 # can be higher if conditions permit
4 changes: 2 additions & 2 deletions test/execution/utils.jl
@@ -20,9 +20,9 @@ function benchmark_kernel!(f!, args...)
trial = benchmark_kernel!(MBF.device(X.x1), f!, args...)
show(stdout, MIME("text/plain"), trial)
end
benchmark_kernel!(::MBF.GPU, f!, args...) =
benchmark_kernel!(::MBF.MBF_GPU, f!, args...) =
BenchmarkTools.@benchmark CUDA.@sync $f!($args...);
benchmark_kernel!(::MBF.CPU, f!, args...) =
benchmark_kernel!(::MBF.MBF_CPU, f!, args...) =
BenchmarkTools.@benchmark $f!($args...);
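
The `$` interpolation in these `@benchmark` calls matters: it splices the arguments into the benchmark as local values, so the trial measures the kernel rather than global-variable access. A minimal sketch:

```julia
using BenchmarkTools

x = rand(1000)
trial = BenchmarkTools.@benchmark sum($x)   # $x avoids benchmarking a global lookup
show(stdout, MIME("text/plain"), trial)
```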

function show_diff(A, B)
