Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial Work on GPU support. #194

Merged
merged 17 commits into from
Jun 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
julia-version: ['1.7','1.8']
julia-version: ['1.9','1.10']
julia-arch: [x64]
os: [ubuntu-latest, windows-latest, macOS-latest]

Expand Down
13 changes: 9 additions & 4 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@ DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
Dolang = "e5c7262c-e9d2-5620-ad8e-1af14eb8a8e3"
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41"
FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"
Formatting = "59287772-0a20-5a39-b81b-1366585eb4c0"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
Expand All @@ -27,20 +27,25 @@ LabelledArrays = "2ee39098-c373-598a-b85f-a56591580800"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LinearMaps = "7a12625a-238d-50fd-b39a-03d52299707e"
LinearOperators = "5c8ed15e-5a4c-59e4-a42b-c7e8811fb125"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
NLsolve = "2774e3e8-f4cf-5e23-947b-6d7e65073b56"
Optim = "429524aa-4258-5aef-a3af-852621145aeb"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
ProgressLogging = "33c8b6b6-d38a-422a-b730-caa89a2f386c"
RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
ResumableFunctions = "c5292f4c-5179-55e1-98c5-05642aab7184"
ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
Roots = "f2b01f46-fcfa-551c-844a-d8ac1e96c665"
SIMD = "fdea26ae-647d-5447-a871-4b548cad5224"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
StringDistances = "88034a9c-02f8-509d-84a9-84ec65e18404"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6"
oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"

[extensions]
DolooneAPIExt = "oneAPI"
DoloCUDAExt = "CUDA"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
16 changes: 8 additions & 8 deletions examples/ymodels/rbc_mc.jl
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@

using StaticArrays
import Dolo: transition, arbitrage
import Dolo: transition, arbitrage, intermediate, hypeof

model = let

name = :rbc_mc

# calibrate some parameters
β = 0.99
σ = 5
η = 1
σ = 5.0
η = 1.0
δ = 0.025
α = 0.33
ρ = 0.8
Expand Down Expand Up @@ -55,7 +55,8 @@ model = let

end

function Dolo.transition(model::typeof(model), s::NamedTuple, x::NamedTuple, M::NamedTuple)

function Dolo.transition(model::hypeof(model), s::NamedTuple, x::NamedTuple, M::NamedTuple)

(;δ, ρ) = model.calibration

Expand All @@ -68,7 +69,7 @@ end



function intermediate(model::typeof(model),s::NamedTuple, x::NamedTuple)
function intermediate(model::hypeof(model), s::NamedTuple, x::NamedTuple)

p = model.calibration

Expand All @@ -81,19 +82,18 @@ function intermediate(model::typeof(model),s::NamedTuple, x::NamedTuple)
end


function arbitrage(model::typeof(model), s::NamedTuple, x::NamedTuple, S::NamedTuple, X::NamedTuple)
function arbitrage(model::hypeof(model), s::NamedTuple, x::NamedTuple, S::NamedTuple, X::NamedTuple)

p = model.calibration

y = intermediate(model, s, x)
Y = intermediate(model, S, X)

res_1 = p.χ*(x.n^p.η)*(y.c^p.σ) - y.w
res_2 = (p.β*(y.c/Y.c)^p.σ)*(1 - p.δ + Y.rk) - 1

return ( (;res_1, res_2) )

end



model
12 changes: 12 additions & 0 deletions ext/DoloCUDAExt.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
module DoloCUDAExt

using CUDA
using Dolo
# should it be merged with the general definition?

import CUDA: CuArray

Dolo.distance(x::GVector{G, A}, y::GVector{G,A}) where G where A<:CuArray = Base.mapreduce(u->maximum(abs.(u)), max, x.data-y.data)


end
16 changes: 16 additions & 0 deletions ext/DolooneAPIExt.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
module DolooneAPIExt

println("Loading oneAPI")
using oneAPI
import oneAPI: oneArray

using Dolo: distance

Dolo.distance(x::GVector{G, A}, y::GVector{G,A}) where G where A<:oneArray = Base.mapreduce(u->maximum(abs.(u)), max, x.data-y.data)

import Dolo.splines: prefilter!

prefilter!(data::oneArray) = prefilter!(data, Val(:KA))


end
92 changes: 92 additions & 0 deletions misc/dev_conversion.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
using Dolo
using Dolo: transition
function hypeof(model)
typ = typeof(model)
otyp = eltype(model)
ntyp = Float32
s = replace(string(typ),string(otyp)=>"Float32")
gtyp = ( Meta.parse(s) )
return Union{eval(gtyp), typ}
end

# hypeof(model)
root_dir = pkgdir(Dolo)

# macro mdef(funtree)
# println(funtree)
# println(dump(funtree))
# println("\nDONE\n")
# return funtree
# end

model = include("$(root_dir)/examples/ymodels/rbc_mc.jl")


# macro mdef(funtree)
# return funtree
# end

# @mdef function fun(a)
# a = (;x=1,y=2)
# (;x, y) = a
# return (x*y)
# end



model32 = Dolo.convert_precision(Float32, model)



s_ = Dolo.calibrated(model32, :states)
x = Dolo.calibrated(model32, :controls)
m_ = Dolo.calibrated(model32, :exogenous)

s = [m_;s_]

dm32 = Dolo.discretize(model32)
Dolo.time_iteration(dm32, verbose=true)

Dolo.time_iteration(model32)

Dolo.transition(model32, s,x,m_)



function Dolo.transition(model::typeof(model), s::NamedTuple, x::NamedTuple, M::NamedTuple)

(;δ, ρ) = model.calibration

# Z = e.Z
K = s.k * (1-δ) + x.i

(;k=K,) ## This is only the endogenous state

end


model_32 = Dolo.convert_precision(Float32, model)
# inherit all methods
transition(m::typeof(model_32), args...) = transition(model, args...)
arbitrage(m::typeof(model_32), args...) = arbitrage(model, args...)

Dolo.time_iteration(model_32)


dm32 = Dolo.discretize(model_32, Dict(:endo=>[1000]) )

typeof(dm32)

using Adapt
import oneAPI: oneArray

gpuArray = oneArray
interp_mode = :linear

wk0 = Dolo.time_iteration_workspace(dm32; interp_mode=interp_mode);
wk1 = Dolo.time_iteration_workspace(dm32; interp_mode=interp_mode);
wk_gpu = Dolo.time_iteration_workspace(dm32, dest=gpuArray; interp_mode=interp_mode);

@time sol1 = time_iteration(dm32, wk0; engine=:nothing, tol_η=1e-5, verbose=true, improve_wait=0, improve=false);
@time sol3 = time_iteration(dm32, wk1; engine=:cpu, tol_η=1e-5, verbose=true, improve_wait=0, improve=false);
@time sol2 = time_iteration(dm32, wk_gpu; engine=:gpu, tol_η=1e-5, verbose=true, improve_wait=0, improve=false);
121 changes: 121 additions & 0 deletions misc/dev_float32.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
using Dolo

root_dir = pkgdir(Dolo)
model_32 = include("$(root_dir)/misc/rbc_float32.jl")


model32 = Dolo.convert_precision(Float32, model_32)

dm32 = Dolo.discretize(model32, Dict(:endo=>[100000]) )

typeof(dm32)


using Adapt
# import oneAPI: oneArray
import CUDA: CuArray
# import Cu
import Adapt: adapt_structure

import Dolo
gpuArray = CuArray
interp_mode = :linear

wk0 = Dolo.time_iteration_workspace(dm32; interp_mode=interp_mode);
wk1 = Dolo.time_iteration_workspace(dm32; interp_mode=interp_mode);
wk_gpu = Dolo.time_iteration_workspace(dm32, dest=gpuArray; interp_mode=interp_mode);

@time sol1 = time_iteration(dm32, wk0; engine=:nothing, tol_η=1e-5, verbose=true, improve_wait=0, improve=false);
@time sol3 = time_iteration(dm32, wk1; engine=:cpu, tol_η=1e-5, verbose=true, improve_wait=0, improve=false);
@time sol2 = time_iteration(dm32, wk_gpu; engine=:gpu, tol_η=1e-5, verbose=true, improve_wait=0, improve=false);




wk0 = Dolo.time_iteration_workspace(dm32; interp_mode=:cubic);
wk1 = Dolo.time_iteration_workspace(dm32; interp_mode=:cubic);
wk_gpu = Dolo.time_iteration_workspace(dm32, dest=gpuArray; interp_mode=:cubic);

@time sol1 = time_iteration(dm32, wk0; engine=:nothing, tol_η=1e-5, verbose=true, improve_wait=10, improve_K=100,improve=true);
@time sol3 = time_iteration(dm32, wk1; engine=:cpu, tol_η=1e-5, verbose=true, improve_wait=10, improve_K=100, improve=true);
# that one stops early
@time sol2 = time_iteration(dm32, wk_gpu; engine=:gpu, tol_η=1e-5, verbose=true, improve_wait=10, improve_K=100, improve=true);





# list available options

# @time wk = Dolo.time_iteration_workspace(dm32, dest=Array; interp_mode=:cubic);

# @time Dolo.splines.prefilter!(c0, Val(:CPU))
# @time Dolo.splines.prefilter!(c0, Val(:Threads))
# @time Dolo.splines.prefilter!(c0, Val(:KA))

# @time Dolo.splines.prefilter!(c0)
# @time Dolo.splines.prefilter!(c1,Val(:KA))

# @time Dolo.splines.prefilter!(c,Val(:KA))

# c_gpu = adapt(Array, c)

# # Dolo.splines.prefilter!(c)



# t_e = get_backend(wk.x0)

# using KernelAbstractions: CPU
# using StaticArrays


# L0 = Dolo.dF_2(dm32, wk0.x0*0, wk0.φ)
# L0_gpu = adapt(oneArray, L0)

# L = deepcopy(L0)

# @time Dolo.dF_2!(L, dm32, wk0.x0, wk0.φ, CPU())

# # @time Dolo.dd_2!(L, dm32, wk0.x0, wk0.φ, CPU())


# @time Dolo.dF_2!(L0_gpu, dm32, wk.x0, wk.φ, t_e)

# Lg = adapt(Array, L0_gpu)


# @time L*wk0.x0;

# #It works !
# using BenchmarkTools
# @benchmark Dolo.mul!(wk0.r0, L, wk0.x0, CPU())
# @benchmark Dolo.mul!(wk.r0, L0_gpu, wk.x0, t_e)
# # try

# # res = fun_(wk.r0, dm32, wk.x0, wk.φ; ndrange=(p,q))
# # catch err
# # nothing
# # end

# wk0 = Dolo.time_iteration_workspace(dm32; interp_mode=:cubic)
# # wk = adapt(CuArray, wk0)
# wk = adapt(oneArray, wk0)

# @time Dolo.time_iteration(dm32, wk, verbose=true, improve=true; engine=:gpu, improve_wait=10, improve_K=500, T=20);




# @time Dolo.time_iteration(dm32, wk0, verbose=true, improve=false; T=20);




# using ForwardDiff

# ForwardDiff.derivative(u->unsafe_trunc(u), 0.1)

# ForwardDiff.jacobian(u->trunc.(Int, u), SVector(0.3, 0.2))

# ForwardDiff.jacobian(u->unsafe_trunc.(Int, u), SVector(0.3, 0.2))
Loading
Loading