Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use KernelAbstractions backend #33

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,29 @@ uuid = "c3c07f87-98de-43f2-a76f-835b330b2cbb"
authors = ["CliMA Contributors <[email protected]>"]
version = "0.3.2"

[deps]
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"

[extensions]
MultiBroadcastFusionCUDAExt = ["CUDA", "Adapt"]

[compat]
julia = "^1.9"
Adapt = "3, 4"
CUDA = "5"
julia = "^1.9"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"

[targets]
Expand Down
36 changes: 0 additions & 36 deletions ext/MultiBroadcastFusionCUDAExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,45 +2,9 @@ module MultiBroadcastFusionCUDAExt

import CUDA, Adapt
import MultiBroadcastFusion as MBF
import MultiBroadcastFusion: fused_copyto!

MBF.device(x::CUDA.CuArray) = MBF.MBF_CUDA()

# GPU path: launch one fused CUDA kernel that evaluates every
# (dest, broadcast) pair in `fmb.pairs` in a single pass over the shared
# index space. Returns the tuple of destination arrays.
function fused_copyto!(fmb::MBF.FusedMultiBroadcast, ::MBF.MBF_CUDA)
    (; pairs) = fmb
    dest = first(pairs).first
    destinations = map(p -> p.first, pairs)
    # All destinations must share axes so one index space covers every pair.
    all(a -> axes(a) == axes(dest), destinations) ||
        error("Cannot fuse broadcast expressions with unequal broadcast axes")
    nitems = length(parent(dest))
    CI = CartesianIndices(axes(dest))
    # Compile without launching so the occupancy API can size the launch.
    kernel =
        CUDA.@cuda always_inline = true launch = false fused_copyto_kernel!(
            fmb,
            CI,
        )
    config = CUDA.launch_configuration(kernel.fun)
    # One thread per element, capped by the occupancy-suggested block size.
    threads = min(nitems, config.threads)
    blocks = cld(nitems, threads)
    kernel(fmb, CI; threads, blocks)
    return destinations
end
import Base.Broadcast
# Device-side kernel body: each thread computes its linear index from the
# CUDA thread/block coordinates and copies that one Cartesian index of
# every fused pair. `CI` maps the linear index back to a CartesianIndex.
function fused_copyto_kernel!(fmb::MBF.FusedMultiBroadcast, CI)
    @inbounds begin
        (; pairs) = fmb
        dest = first(pairs).first
        nitems = length(dest)
        idx =
            CUDA.threadIdx().x +
            (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x
        # Guard: the launched grid may be padded past `nitems`.
        if 1 ≤ idx ≤ nitems
            MBF.rcopyto_at!(pairs, CI[idx])
        end
    end
    return nothing
end

# Make a broadcast function device-compatible via `Adapt`.
function adapt_f(to, f::F) where {F}
    return Adapt.adapt(to, f)
end
# A `Type` is not `Adapt.adapt`-able directly; wrap the constructor in an
# anonymous function with the same call behavior instead.
function adapt_f(to, ::Type{F}) where {F}
    return (args...) -> F(args...)
end

Expand Down
114 changes: 68 additions & 46 deletions src/execution/fused_kernels.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,20 @@
# NOTE(review): `@make_fused` is defined elsewhere in this package —
# presumably these generate the `fused_direct`/`fused_assemble` user-facing
# macros that build a `FusedMultiBroadcast`; confirm against macros.jl.
@make_fused fused_direct FusedMultiBroadcast fused_direct
@make_fused fused_assemble FusedMultiBroadcast fused_assemble

import KernelAbstractions as KA
using KernelAbstractions

# For tests, we can move this out.
# Device tag types used to dispatch `fused_copyto!`. `MBF_CPU` is the
# fallback for any `AbstractArray`; GPU array packages (see the CUDA
# extension) add their own `device` methods returning `MBF_CUDA`.
struct MBF_CPU end
struct MBF_CUDA end
device(x::AbstractArray) = MBF_CPU()

# KernelAbstractions kernel: one work-item per global Cartesian index;
# each work-item copies that index of every fused (dest, broadcast) pair.
KA.@kernel function fused_copyto_kernel!(fmb::FusedMultiBroadcast)
    (; pairs) = fmb
    I = @index(Global, Cartesian)
    rcopyto_at!(pairs, I)
end

function Base.copyto!(fmb::FusedMultiBroadcast)
# Since we intercept Base.copyto!, we have not yet
# called Base.Broadcast.instantiate (as this is done
Expand All @@ -18,59 +28,71 @@ function Base.copyto!(fmb::FusedMultiBroadcast)
)
(; pairs) = fmb′ # (Pair(dest1, bc1),Pair(dest2, bc2),...)
dest = first(pairs).first
fused_copyto!(fmb′, device(dest))

assert_sizes(pairs)
# assert_backends(pairs) # perhaps its fine to just compare all `dest` backends?
dest1 = first(pairs).first
backend = KA.get_backend(dest1)
kernel = fused_copyto_kernel!(backend)
kernel(fmb′; ndrange = size(dest1))
end

Base.@propagate_inbounds function rcopyto_at!(
pair::Pair,
i::Vararg{T},
) where {T}
#####
##### rcopyto_at!
#####

# Unpack one (dest, broadcast) pair and copy element `I`.
# Fix: the span contained a stale interleaved line
# `@inbounds dest[i...] = src[i...]` left over from the old vararg-index
# signature; `i` is undefined under the new `(pair, I)` signature, so the
# leftover statement is removed and the delegation to the leaf method kept.
Base.@propagate_inbounds function rcopyto_at!(pair::Pair, I)
    dest, src = pair.first, pair.second
    rcopyto_at!(dest, src, I)
    return nothing
end
# Leaf copy for vectors: write element `I` of `src` into `dest`.
# NOTE: can't use KernelAbstractions' `@Const(src)` on this argument here.
Base.@propagate_inbounds function rcopyto_at!(dest::AbstractVector, src, I)
    @inbounds setindex!(dest, getindex(src, I), I)
    return nothing
end
# Leaf copy for general (non-vector) arrays; same operation as the vector
# method, kept as a separate signature.
Base.@propagate_inbounds function rcopyto_at!(dest::AbstractArray, src, I)
    @inbounds setindex!(dest, getindex(src, I), I)
    return nothing
end
Base.@propagate_inbounds function rcopyto_at!(
pairs::Tuple,
i::Vararg{T},
) where {T}
rcopyto_at!(first(pairs), i...)
rcopyto_at!(Base.tail(pairs), i...)
# Recursively copy element `I` of every pair in the tuple (compile-time
# unrolled head/tail recursion).
Base.@propagate_inbounds function rcopyto_at!(pairs::Tuple, I)
    head = first(pairs)
    rest = Base.tail(pairs)
    rcopyto_at!(head, I)
    rcopyto_at!(rest, I)
end
Base.@propagate_inbounds rcopyto_at!(
pairs::Tuple{<:Any},
i::Vararg{T},
) where {T} = rcopyto_at!(first(pairs), i...)
@inline rcopyto_at!(pairs::Tuple{}, i::Vararg{T}) where {T} = nothing
# Single pair left: copy it, no tail recursion needed.
Base.@propagate_inbounds function rcopyto_at!(pairs::Tuple{<:Any}, I)
    return rcopyto_at!(first(pairs), I)
end
# Empty tuple: nothing left to copy.
@inline rcopyto_at!(pairs::Tuple{}, I) = nothing

# This is better than the baseline.
function fused_copyto!(fmb::FusedMultiBroadcast, ::MBF_CPU)
(; pairs) = fmb
destinations = map(x -> x.first, pairs)
ei = if eltype(destinations) <: Vector
eachindex(destinations...)
else
eachindex(IndexCartesian(), destinations...)
end
for (dest, bc) in pairs
@inbounds @simd ivdep for i in ei
dest[i] = bc[i]
end
end
return destinations
#####
##### assert_sizes
#####

# Verify that a fused pair's destination and source cover the same index
# space.
# Fixes: compare `axes` rather than `size`, so offset arrays with equal
# sizes but shifted axes are rejected (matching the old CUDA path's axes
# check); and throw a catchable `DimensionMismatch` instead of `@assert`,
# which is input validation and may be compiled out at higher
# optimization levels.
Base.@propagate_inbounds function assert_sizes(pair::Pair)
    dest, src = pair.first, pair.second
    axes(dest) == axes(src) || throw(
        DimensionMismatch(
            "cannot fuse broadcast expressions with unequal broadcast axes",
        ),
    )
    return nothing
end
# Walk the tuple of pairs, checking each in turn (unrolled recursion).
Base.@propagate_inbounds function assert_sizes(pairs::Tuple)
    head = first(pairs)
    assert_sizes(head)
    assert_sizes(Base.tail(pairs))
end
# Single pair left: check it directly.
Base.@propagate_inbounds function assert_sizes(pairs::Tuple{<:Any})
    return assert_sizes(first(pairs))
end
# Empty tuple: trivially consistent.
@inline assert_sizes(pairs::Tuple{}) = nothing

#####
##### assert_backends
#####

# This should, in theory be better, but it seems like inlining is
# failing somewhere.
# function fused_copyto!(fmb::FusedMultiBroadcast, ::MBF_CPU)
# (; pairs) = fmb
# destinations = map(x -> x.first, pairs)
# ei = if eltype(destinations) <: Vector
# eachindex(destinations...)
# else
# eachindex(IndexCartesian(), destinations...)
# end
# @inbounds @simd ivdep for i in ei
# MBF.rcopyto_at!(pairs, i)
# end
# end
# Verify that a fused pair's destination and source live on the same
# KernelAbstractions backend, so one kernel launch can touch both.
# Fix: throw an explicit `ArgumentError` rather than using `@assert`,
# which is input validation and may be compiled out at higher
# optimization levels.
Base.@propagate_inbounds function assert_backends(pair::Pair)
    dest, src = pair.first, pair.second
    KA.get_backend(dest) == KA.get_backend(src) || throw(
        ArgumentError(
            "cannot fuse broadcast expressions across different backends",
        ),
    )
    return nothing
end
# Walk the tuple of pairs, checking each in turn (unrolled recursion).
Base.@propagate_inbounds function assert_backends(pairs::Tuple)
    head = first(pairs)
    assert_backends(head)
    assert_backends(Base.tail(pairs))
end
# Single pair left: check it directly.
Base.@propagate_inbounds function assert_backends(pairs::Tuple{<:Any})
    return assert_backends(first(pairs))
end
# Empty tuple: trivially consistent.
@inline assert_backends(pairs::Tuple{}) = nothing
2 changes: 2 additions & 0 deletions test/execution/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#=
using Revise; include(joinpath("test", "execution", "runtests.jl"))
=#
using Test
using SafeTestsets

#! format: off
@safetestset "fused_shared_reads" begin; @time include("bm_fused_shared_reads.jl"); end
Expand Down
3 changes: 1 addition & 2 deletions test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#=
julia --project
using TestEnv
TestEnv.activate()
using TestEnv; TestEnv.activate()
using CUDA;
ENV["PERFORM_BENCHMARK"]="true";

Expand Down
Loading