Skip to content

Commit

Permalink
Use KernelAbstractions backend
Browse files Browse the repository at this point in the history
  • Loading branch information
charleskawczynski committed Oct 3, 2024
1 parent f3064ba commit 25f7bdc
Show file tree
Hide file tree
Showing 4 changed files with 83 additions and 73 deletions.
9 changes: 6 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,29 @@ uuid = "c3c07f87-98de-43f2-a76f-835b330b2cbb"
authors = ["CliMA Contributors <[email protected]>"]
version = "0.3.1"

[deps]
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"

[extensions]
MultiBroadcastFusionCUDAExt = ["CUDA", "Adapt"]

[compat]
julia = "^1.9"
Adapt = "3, 4"
CUDA = "5"
julia = "^1.9"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"

[targets]
Expand Down
24 changes: 1 addition & 23 deletions ext/MultiBroadcastFusionCUDAExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,8 @@ module MultiBroadcastFusionCUDAExt

import CUDA, Adapt
import MultiBroadcastFusion as MBF
import MultiBroadcastFusion: fused_copyto!

MBF.device(x::CUDA.CuArray) = MBF.GPU()

# Launch the fused broadcast on the GPU with a 1-D CUDA configuration: one
# thread per element of the first destination's parent array, in blocks of at
# most `max_threads` threads.
#
# Arguments:
# - `fmb`: the fused broadcast; `fmb.pairs` holds the `dest => bc` pairs.
# - `::MBF.GPU`: device tag selecting this method.
#
# Returns `nothing`.
function fused_copyto!(fmb::MBF.FusedMultiBroadcast, ::MBF.GPU)
    (; pairs) = fmb
    dest = first(pairs).first
    nitems = length(parent(dest))
    # Nothing to copy. This also avoids `cld(0, 0)` below, which would throw
    # a `DivideError` because `nthreads` would be zero.
    nitems == 0 && return nothing
    max_threads = 256 # can be higher if conditions permit
    nthreads = min(max_threads, nitems)
    nblocks = cld(nitems, nthreads)
    CUDA.@cuda threads = (nthreads) blocks = (nblocks) fused_copyto_kernel!(fmb)
    return nothing
end
# CUDA kernel body: each thread computes its global linear index and, when
# that index lies within the first destination, performs the copy for every
# pair at that index via `MBF.rcopyto_at!`.
#
# Returns `nothing`.
function fused_copyto_kernel!(fmb::MBF.FusedMultiBroadcast)
    (; pairs) = fmb
    dest = first(pairs).first
    nitems = length(dest)
    idx = CUDA.threadIdx().x + (CUDA.blockIdx().x - 1) * CUDA.blockDim().x
    # Guard: the launch rounds the number of blocks up, so the last block may
    # contain threads whose index is past the end of the destination.
    if idx <= nitems
        MBF.rcopyto_at!(pairs, idx)
    end
    return nothing
end
MBF.device(x::CUDA.CuArray) = MBF.MBF_CUDA()

# Adapt a callable `f` for the target `to` by delegating to `Adapt.adapt`.
function adapt_f(to, f::F) where {F}
    return Adapt.adapt(to, f)
end
# When a type is passed in place of a callable, return a closure that forwards
# its arguments to that type's constructor (`to` is not used).
function adapt_f(to, ::Type{F}) where {F}
    return (args...) -> F(args...)
end
Expand Down
119 changes: 74 additions & 45 deletions src/execution/fused_kernels.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,57 +2,86 @@
@make_fused fused_direct FusedMultiBroadcast fused_direct
@make_fused fused_assemble FusedMultiBroadcast fused_assemble

# Device tags: empty structs whose instances are passed as dispatch arguments.
struct CPU end
struct GPU end
# Fallback: any `AbstractArray` without a more specific `device` method is
# tagged as `CPU()`.
device(x::AbstractArray) = CPU()
import KernelAbstractions as KA
using KernelAbstractions

# For tests, we can move this out.
# Device tags: empty structs whose instances are passed as dispatch arguments.
struct MBF_CPU end
struct MBF_CUDA end
# Fallback: any `AbstractArray` without a more specific `device` method is
# tagged as `MBF_CPU()`.
device(x::AbstractArray) = MBF_CPU()

# KernelAbstractions kernel: each work-item fetches its global Cartesian index
# and hands it, together with all `dest => bc` pairs of `fmb`, to
# `rcopyto_at!`.
KA.@kernel function fused_copyto_kernel!(fmb::FusedMultiBroadcast)
    idx = @index(Global, Cartesian)
    rcopyto_at!(fmb.pairs, idx)
end

# Execute every broadcast held by `fmb`, writing each one into its paired
# destination.
# NOTE(review): this body contains both a device-dispatch call
# (`fused_copyto!(fmb, device(dest))`) and a KernelAbstractions launch, so the
# copy appears to run twice — confirm which of the two paths is meant to stay.
function Base.copyto!(fmb::FusedMultiBroadcast)
pairs = fmb.pairs # (Pair(dest1, bc1),Pair(dest2, bc2),...)
dest = first(pairs).first
fused_copyto!(fmb, device(dest))
(; pairs) = fmb # (Pair(dest1, bc1), Pair(dest2, bc2),...)
# Each destination must have the same size as its source.
assert_sizes(pairs)
# assert_backends(pairs) # perhaps it's fine to just compare all `dest` backends
dest1 = first(pairs).first
# The backend is taken from the first destination only.
backend = KA.get_backend(dest1)
kernel = fused_copyto_kernel!(backend)
# NOTE(review): `ndrange` is the scalar `length(dest1)` while the kernel
# requests a Cartesian index — confirm this is intended for
# multi-dimensional destinations.
kernel(fmb; ndrange = length(dest1))
end

Base.@propagate_inbounds function rcopyto_at!(pair::Pair, i...)
#####
##### rcopyto_at!
#####

Base.@propagate_inbounds function rcopyto_at!(pair::Pair, I)
dest, src = pair.first, pair.second
@inbounds dest[i...] = src[i...]
rcopyto_at!(dest, src, I)
return nothing
end
Base.@propagate_inbounds function rcopyto_at!(pairs::Tuple, i...)
rcopyto_at!(first(pairs), i...)
rcopyto_at!(Base.tail(pairs), i...)
# Base.@propagate_inbounds function rcopyto_at!(dest, @Const(src), I) # can't use @Const(src) here...
# Copy the single element at index `I` from `src` into the vector `dest`.
# Bounds checks are skipped for both the read and the write. Returns `nothing`.
Base.@propagate_inbounds function rcopyto_at!(dest::AbstractVector, src, I)
    @inbounds begin
        value = src[I]
        dest[I] = value
    end
    return nothing
end
Base.@propagate_inbounds rcopyto_at!(pairs::Tuple{<:Any}, i...) =
rcopyto_at!(first(pairs), i...)
@inline rcopyto_at!(pairs::Tuple{}, i...) = nothing
# Copy the single element at index `I` from `src` into the array `dest`.
# Bounds checks are skipped for both the read and the write. Returns `nothing`.
Base.@propagate_inbounds function rcopyto_at!(dest::AbstractArray, src, I)
    @inbounds begin
        element = src[I]
        dest[I] = element
    end
    return nothing
end
# Recursive unrolling over a tuple of pairs: process the first entry at index
# `I`, then recurse on the remaining entries.
Base.@propagate_inbounds function rcopyto_at!(pairs::Tuple, I)
    head, rest = pairs[1], Base.tail(pairs)
    rcopyto_at!(head, I)
    return rcopyto_at!(rest, I)
end
# Single remaining entry: process it and stop.
Base.@propagate_inbounds function rcopyto_at!(pairs::Tuple{<:Any}, I)
    return rcopyto_at!(pairs[1], I)
end
# Empty tuple: nothing to do.
@inline function rcopyto_at!(pairs::Tuple{}, I)
    return nothing
end

# This is better than the baseline.
# CPU implementation: for each `dest => bc` pair in turn, loop over one shared
# index set and assign `dest[i] = bc[i]`.
function fused_copyto!(fmb::FusedMultiBroadcast, ::CPU)
(; pairs) = fmb
destinations = map(x -> x.first, pairs)
# Shared index set built from all destinations: plain `eachindex` when the
# destinations' element type is a `Vector`, Cartesian indices otherwise.
ei = if eltype(destinations) <: Vector
eachindex(destinations...)
else
eachindex(IndexCartesian(), destinations...)
end
for (dest, bc) in pairs
# Bounds checks are disabled and `@simd ivdep` is applied to the inner loop.
@inbounds @simd ivdep for i in ei
dest[i] = bc[i]
end
end
end


# This should, in theory, be better, but inlining seems to be
# failing somewhere.
# function fused_copyto!(fmb::FusedMultiBroadcast, ::CPU)
# (; pairs) = fmb
# destinations = map(x -> x.first, pairs)
# ei = if eltype(destinations) <: Vector
# eachindex(destinations...)
# else
# eachindex(IndexCartesian(), destinations...)
# end
# @inbounds @simd ivdep for i in ei
# MBF.rcopyto_at!(pairs, i)
# end
# end
#####
##### assert_sizes
#####

# Check that the destination and the source of one `dest => src` pair have
# the same `size`.
#
# Returns `nothing`. Throws an `AssertionError` whose message reports both
# sizes when they differ.
Base.@propagate_inbounds function assert_sizes(pair::Pair)
    dest, src = pair.first, pair.second
    @assert size(dest) == size(src) string(
        "size(dest) == size(src) failed: size(dest) = ",
        size(dest),
        ", size(src) = ",
        size(src),
    )
    return nothing
end
# Recursive unrolling over a tuple of pairs: check the first entry, then
# recurse on the remaining entries.
Base.@propagate_inbounds function assert_sizes(pairs::Tuple)
    assert_sizes(first(pairs))
    return assert_sizes(Base.tail(pairs))
end
# Single remaining entry: check it and stop.
Base.@propagate_inbounds assert_sizes(pairs::Tuple{<:Any}) =
    assert_sizes(first(pairs))
# Empty tuple: nothing to check.
@inline assert_sizes(pairs::Tuple{}) = nothing

#####
##### assert_backends
#####

# Check that the destination and the source of one `dest => src` pair report
# the same backend via `KA.get_backend`. Returns `nothing`; an
# `AssertionError` is thrown when the backends differ.
Base.@propagate_inbounds function assert_backends(pair::Pair)
    dest = pair.first
    src = pair.second
    @assert KA.get_backend(dest) == KA.get_backend(src)
    return nothing
end
# Recursive unrolling over a tuple of pairs: check the first entry, then
# recurse on the remaining entries.
Base.@propagate_inbounds function assert_backends(pairs::Tuple)
    head, rest = pairs[1], Base.tail(pairs)
    assert_backends(head)
    return assert_backends(rest)
end
# Single remaining entry: check it and stop.
Base.@propagate_inbounds function assert_backends(pairs::Tuple{<:Any})
    return assert_backends(pairs[1])
end
# Empty tuple: nothing to check.
@inline function assert_backends(pairs::Tuple{})
    return nothing
end
4 changes: 2 additions & 2 deletions test/execution/bm_fused_reads_vs_hard_coded.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import MultiBroadcastFusion as MBF
# =========================================== hard-coded implementations
perf_kernel_hard_coded!(X, Y) = perf_kernel_hard_coded!(X, Y, MBF.device(X.x1))

function perf_kernel_hard_coded!(X, Y, ::MBF.CPU)
function perf_kernel_hard_coded!(X, Y, ::MBF.MBF_CPU)
(; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X
(; y1, y2, y3, y4, y5, y6, y7, y8, y9, y10) = Y
@inbounds for i in eachindex(x1)
Expand All @@ -27,7 +27,7 @@ end
@static get(ENV, "USE_CUDA", nothing) == "true" && using CUDA
use_cuda = @isdefined(CUDA) && CUDA.has_cuda() # will be true if you first run `using CUDA`
@static if use_cuda
function perf_kernel_hard_coded!(X, Y, ::MBF.GPU)
function perf_kernel_hard_coded!(X, Y, ::MBF.MBF_CUDA)
x1 = X.x1
nitems = length(parent(x1))
max_threads = 256 # can be higher if conditions permit
Expand Down

0 comments on commit 25f7bdc

Please sign in to comment.