Native GPU support #65

Open: wants to merge 52 commits into master

Commits (52)
9b8872c
Initialize CUDA extension
MilesCranmer Jan 14, 2024
666708e
Fix output
MilesCranmer Jan 14, 2024
4484d18
Switch to GPUArrays rather than CUDA
MilesCranmer Jan 14, 2024
644c3b8
Switch to GPUArrays
MilesCranmer Jan 14, 2024
0bfd8ee
Add file for converting to arrays
MilesCranmer Jan 15, 2024
5c90896
Add GPU kernel for evaluation
MilesCranmer Jan 15, 2024
57fff36
Allow submitting multiple trees at once
MilesCranmer Jan 15, 2024
81529af
Working GPU kernel for expression evaluation!
MilesCranmer Jan 15, 2024
07dec6b
Formatting
MilesCranmer Jan 15, 2024
c19f37f
Remove GPUArrays extension as redundant
MilesCranmer Jan 15, 2024
ade1613
Remove unnecessary sync call
MilesCranmer Jan 15, 2024
53b42ce
Reduce number of allocations
MilesCranmer Jan 15, 2024
cdfcec2
Greatly improve allocations
MilesCranmer Jan 15, 2024
20a8f38
More memory allocation improvements
MilesCranmer Jan 15, 2024
6bbff73
Allow user to pass preallocated buffer
MilesCranmer Jan 15, 2024
03d2d24
Try to fix unbound type parameter
MilesCranmer Jan 15, 2024
2e128bb
Expand CUDA compat to 4
MilesCranmer Jan 15, 2024
da31b87
Clean up formatting
MilesCranmer Jan 16, 2024
4fdc025
Reduce allocations in `as_array`
MilesCranmer Jan 16, 2024
c64a6cb
Fix indexing
MilesCranmer Jan 16, 2024
88ee6cf
Compatible CUDA kernel with empty operators
MilesCranmer Jan 16, 2024
f1f2221
Merge branch 'bump-alloc3' into cuda-5
MilesCranmer Jan 28, 2024
243bdda
Formatting
MilesCranmer Jan 28, 2024
6b0337c
Merge branch 'master' into cuda-5
MilesCranmer Feb 3, 2024
b1aca68
Ensure we always check root during array construction
MilesCranmer Feb 25, 2024
9372150
Mechanism to unittest cuda kernel
MilesCranmer Feb 25, 2024
5241645
Merge branch 'master' into cuda-5
MilesCranmer Feb 25, 2024
5befede
Fix unbound type issues
MilesCranmer Feb 25, 2024
74094e3
Start working on CPU-based unittests
MilesCranmer Feb 25, 2024
9d7500c
Fix indexing bugs in GPU kernels
MilesCranmer Feb 25, 2024
f6155ba
Fix unsafe `sin` in CUDA kernel tests
MilesCranmer Feb 25, 2024
a7d7e79
Fix `MaybeCuArray` definition
MilesCranmer Feb 25, 2024
46a6b48
Test buffered-evaluation of CUDA kernels
MilesCranmer Feb 25, 2024
7ceacc0
Only test CUDA on 1.9+
MilesCranmer Feb 25, 2024
08df9a3
Use vectors for vector input; tuple for tuple
MilesCranmer Feb 25, 2024
8b4cbf3
Fix unbound args in multi-tree eval
MilesCranmer Feb 25, 2024
676ad86
Add test for no buffer recreation
MilesCranmer Feb 26, 2024
c1e579a
Start switching to KernelAbstractions.jl
MilesCranmer Feb 26, 2024
cb2d055
Revert "Start switching to KernelAbstractions.jl"
MilesCranmer Feb 26, 2024
db94de2
Merge branch 'master' into cuda-5
MilesCranmer Jul 17, 2024
bfa9148
fix: as array imports
MilesCranmer Jul 17, 2024
9f49619
hack: try implementing cooperative group
MilesCranmer Jul 17, 2024
1422e36
refactor: clean up extensions
MilesCranmer Nov 29, 2024
6c01ded
Merge tag 'v1.8.0' into cuda-5
MilesCranmer Dec 15, 2024
74afd72
Revert "hack: try implementing cooperative group"
MilesCranmer Dec 15, 2024
38c589b
refactor: clean up `map` operations
MilesCranmer Dec 15, 2024
4cce551
fix: stabilize CUDA ext
MilesCranmer Dec 15, 2024
b7ecdd3
test: modularize CUDA test
MilesCranmer Dec 15, 2024
8fe652c
fix: potential type instability in as_array
MilesCranmer Dec 15, 2024
1d06bd0
refactor: as array into modular functions
MilesCranmer Dec 16, 2024
74f56d4
refactor: ensure `@inbounds` for gpu kernel
MilesCranmer Dec 16, 2024
3da1d38
refactor: avoid SubArray in CUDA kernel
MilesCranmer Dec 16, 2024
6 changes: 6 additions & 0 deletions Project.toml
@@ -5,6 +5,7 @@ version = "1.8.0"

[deps]
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
DispatchDoctor = "8d63f2c5-f18a-4cf2-ba9d-b3f60fc568c8"
Interfaces = "85a1e053-f937-4924-92a5-1367d23b7b87"
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
@@ -15,21 +16,25 @@ TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76"

[weakdeps]
Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
Optim = "429524aa-4258-5aef-a3af-852621145aeb"
SymbolicUtils = "d1185830-fcd6-423d-90d6-eec64667417b"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

[extensions]
DynamicExpressionsBumperExt = "Bumper"
DynamicExpressionsCUDAExt = "CUDA"
DynamicExpressionsLoopVectorizationExt = "LoopVectorization"
DynamicExpressionsOptimExt = "Optim"
DynamicExpressionsSymbolicUtilsExt = "SymbolicUtils"
DynamicExpressionsZygoteExt = "Zygote"

[compat]
Bumper = "0.6"
CUDA = "4, 5"
ChainRulesCore = "1"
Compat = "4.16"
DispatchDoctor = "0.4"
Interfaces = "0.3"
LoopVectorization = "0.12"
@@ -43,6 +48,7 @@ julia = "1.10"

[extras]
Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
Optim = "429524aa-4258-5aef-a3af-852621145aeb"
SymbolicUtils = "d1185830-fcd6-423d-90d6-eec64667417b"
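The Project.toml additions above register CUDA as a weak dependency and DynamicExpressionsCUDAExt as the extension built against it, so the GPU code only loads when CUDA.jl is present in the environment. A minimal sketch of what activates the extension, assuming a Julia 1.10 project with both packages installed (the `Base.get_extension` check is purely illustrative):

# Illustrative only: how the weak-dependency extension gets activated.
using DynamicExpressions   # DynamicExpressionsCUDAExt stays dormant until CUDA is loaded
using CUDA                 # loading CUDA triggers the extension listed under [extensions]

# Optional check that the extension module is now available (Julia >= 1.9):
ext = Base.get_extension(DynamicExpressions, :DynamicExpressionsCUDAExt)
@assert ext !== nothing
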
215 changes: 215 additions & 0 deletions ext/DynamicExpressionsCUDAExt.jl
@@ -0,0 +1,215 @@
module DynamicExpressionsCUDAExt

# TODO: Switch to KernelAbstractions.jl (once they hit v1.0)
using CUDA: @cuda, CuArray, blockDim, blockIdx, threadIdx
using DynamicExpressions: OperatorEnum, AbstractExpressionNode
using DynamicExpressions.EvaluateModule: get_nbin, get_nuna
using DynamicExpressions.AsArrayModule:
as_array,
IDX_DEGREE,
IDX_FEATURE,
IDX_OP,
IDX_EXECUTION_ORDER,
IDX_SELF,
IDX_L,
IDX_R,
IDX_CONSTANT
using DispatchDoctor: @stable

import DynamicExpressions.EvaluateModule: eval_tree_array

# Array type used exclusively for testing purposes
struct FakeCuArray{T,N,A<:AbstractArray{T,N}} <: AbstractArray{T,N}
a::A
end
Base.similar(x::FakeCuArray, dims::Integer...) = FakeCuArray(similar(x.a, dims...))
Base.getindex(x::FakeCuArray, i::Int...) = getindex(x.a, i...)
Base.setindex!(x::FakeCuArray, v, i::Int...) = setindex!(x.a, v, i...)
Base.size(x::FakeCuArray) = size(x.a)

const MaybeCuArray{T,N} = Union{CuArray{T,N},FakeCuArray{T,N}}

@stable default_mode = "disable" begin
to_device(a, ::CuArray) = CuArray(a)
to_device(a, ::FakeCuArray) = FakeCuArray(a)
end

@stable default_mode = "disable" function eval_tree_array(
tree::AbstractExpressionNode{T}, gcX::MaybeCuArray{T,2}, operators::OperatorEnum; kws...
) where {T<:Number}
(outs, is_good) = eval_tree_array((tree,), gcX, operators; kws...)
return (only(outs), only(is_good))
end

@stable default_mode = "disable" function eval_tree_array(
trees::Union{Tuple{N,Vararg{N}},AbstractVector{N}},
gcX::MaybeCuArray{T,2},
operators::OperatorEnum;
buffer=nothing,
gpu_workspace=nothing,
gpu_buffer=nothing,
roots=nothing,
num_nodes=nothing,
num_launches=nothing,
update_buffers::Val{_update_buffers}=Val(true),
kws...,
) where {T<:Number,N<:AbstractExpressionNode{T},_update_buffers}
local val
if _update_buffers
(; val, roots, buffer, num_nodes, num_launches) = as_array(Int32, trees; buffer)
end
# TODO: Fix this type instability?
num_elem = size(gcX, 2)

num_launches = num_launches isa Integer ? num_launches : num_launches[]

## The following array is our "workspace" for
## the GPU kernel, with size equal to the number of rows
## in the input data by the number of nodes in the tree.
## It has one extra row to store the constant values.
gworkspace = @something(gpu_workspace, similar(gcX, num_elem + 1, num_nodes))
if _update_buffers
copyto!(@view(gworkspace[end, :]), val)
end
val_idx = size(gworkspace, 1)

gbuffer = if !_update_buffers
gpu_buffer
elseif gpu_buffer === nothing
to_device(buffer, gcX)
else
copyto!(gpu_buffer, buffer)
end

# Removed @view definitions of gdegree, gfeature, etc.
# We'll index directly into gbuffer using the constants above.

num_threads = 256
num_blocks = nextpow(2, ceil(Int, num_elem * num_nodes / num_threads))

#! format: off
_launch_gpu_kernel!(
num_threads, num_blocks, num_launches, gworkspace,
# Thread info:
num_elem, num_nodes,
# We'll pass gbuffer directly to the kernel now:
operators, gcX, gbuffer, val_idx,
)
#! format: on

out = map(r -> @view(gworkspace[begin:(end - 1), r]), roots)
is_good = map(Returns(true), trees)

return (out, is_good)
end

#! format: off
@stable default_mode = "disable" function _launch_gpu_kernel!(
num_threads, num_blocks, num_launches::Integer, buffer::AbstractArray{T,2},
# Thread info:
num_elem::Integer, num_nodes::Integer,
operators::OperatorEnum, cX::AbstractArray{T,2}, gbuffer::AbstractArray{Int32,2},
val_idx::Integer
) where {T}
#! format: on
nuna = get_nuna(typeof(operators))
nbin = get_nbin(typeof(operators))
(nuna > 10 || nbin > 10) &&
error("Too many operators. Kernels are only compiled up to 10.")
gpu_kernel! = create_gpu_kernel(operators, Val(nuna), Val(nbin))
for launch in one(Int32):Int32(num_launches)
#! format: off
if buffer isa CuArray
@cuda threads=num_threads blocks=num_blocks gpu_kernel!(
buffer,
launch, num_elem, num_nodes,
cX, gbuffer, val_idx
)
else
Threads.@threads for i in 1:(num_threads * num_blocks)
gpu_kernel!(
buffer,
launch, num_elem, num_nodes,
cX, gbuffer, val_idx, i
)
end
end
#! format: on
end
return nothing
end

# Need to pre-compute the GPU kernels with an `@eval` for each number of operators
# 1. We need to use an `@nif` over operators, as GPU kernels
# can't index into arrays of operators.
# 2. `@nif` is evaluated at parse time and needs to know the number of
# ifs to generate at that time, so we can't simply use specialization.
# 3. We can't use `@generated` because we can't create closures in those.
for nuna in 0:10, nbin in 0:10
@eval function create_gpu_kernel(operators::OperatorEnum, ::Val{$nuna}, ::Val{$nbin})
#! format: off
function (
buffer,
launch::Integer, num_elem::Integer, num_nodes::Integer,
cX::AbstractArray, gbuffer::AbstractArray{Int32,2},
val_idx::Integer,
i=nothing,
)
i = @something(i, (blockIdx().x - 1) * blockDim().x + threadIdx().x)
if i > num_elem * num_nodes
return nothing
end

node = (i - 1) % num_nodes + 1
elem = (i - node) ÷ num_nodes + 1


@inbounds begin
if gbuffer[IDX_EXECUTION_ORDER, node] != launch
return nothing
end

# Use constants to index gbuffer:
cur_degree = gbuffer[IDX_DEGREE, node]
cur_idx = gbuffer[IDX_SELF, node]

if cur_degree == 0
if gbuffer[IDX_CONSTANT, node] == 1
cur_val = buffer[val_idx, node]
buffer[elem, cur_idx] = cur_val
else
cur_feature = gbuffer[IDX_FEATURE, node]
buffer[elem, cur_idx] = cX[cur_feature, elem]
end
else
if cur_degree == 1 && $nuna > 0
cur_op = gbuffer[IDX_OP, node]
l_idx = gbuffer[IDX_L, node]
Base.Cartesian.@nif(
$nuna,
i -> i == cur_op,
i -> let op = operators.unaops[i]
buffer[elem, cur_idx] = op(buffer[elem, l_idx])
end
)
elseif $nbin > 0
cur_op = gbuffer[IDX_OP, node]
l_idx = gbuffer[IDX_L, node]
r_idx = gbuffer[IDX_R, node]
Base.Cartesian.@nif(
$nbin,
i -> i == cur_op,
i -> let op = operators.binops[i]
buffer[elem, cur_idx] = op(buffer[elem, l_idx], buffer[elem, r_idx])
end
)
end
end
end
#! format: on
return nothing
end
end
end

end
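
For context, a rough usage sketch of the new `eval_tree_array` methods added above. The operator set, tree, and data here are made up for illustration, and the GPU path assumes a working CUDA device; the `FakeCuArray` wrapper in the extension exists so the same code path can be exercised on CPU in tests.

using DynamicExpressions, CUDA

operators = OperatorEnum(; binary_operators=[+, -, *], unary_operators=[cos])
x1 = Node{Float64}(; feature=1)
x2 = Node{Float64}(; feature=2)
tree = x1 * cos(x2 - 3.2)

# Move the dataset (features x rows) to the GPU; the new method dispatches on CuArray.
X = CuArray(randn(Float64, 2, 1000))
y, completed = eval_tree_array(tree, X, operators)

# Several trees can be evaluated in a single call; each output is a view into
# the shared GPU workspace, and `completed` carries one flag per tree.
ys, flags = eval_tree_array((tree, x1 * x2), X, operators)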