Skip to content

Commit

Permalink
GPUfied splines 1/2
Browse files Browse the repository at this point in the history
  • Loading branch information
albop committed May 28, 2024
1 parent f2f6754 commit f228dec
Show file tree
Hide file tree
Showing 11 changed files with 260 additions and 143 deletions.
3 changes: 1 addition & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ BlockDiagonals = "0a1fb500-61f7-11e9-3c65-f5ef3456f9f0"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Continuables = "79afa230-ca09-11e8-120b-5decf7bf5e25"
Crayons = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
Cthulhu = "f68482b8-f384-11e8-15f7-abe071a5a75f"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
Expand All @@ -30,7 +29,6 @@ LabelledArrays = "2ee39098-c373-598a-b85f-a56591580800"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LinearMaps = "7a12625a-238d-50fd-b39a-03d52299707e"
LinearOperators = "5c8ed15e-5a4c-59e4-a42b-c7e8811fb125"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
NLsolve = "2774e3e8-f4cf-5e23-947b-6d7e65073b56"
Optim = "429524aa-4258-5aef-a3af-852621145aeb"
Expand All @@ -47,3 +45,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"

3 changes: 2 additions & 1 deletion examples/ymodels/rbc_mc.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@

using StaticArrays
import Dolo: transition, arbitrage
import Dolo: transition, arbitrage, intermediate

model = let

Expand Down Expand Up @@ -87,6 +87,7 @@ function arbitrage(model::typeof(model), s::NamedTuple, x::NamedTuple, S::NamedT

y = intermediate(model, s, x)
Y = intermediate(model, S, X)

res_1 = p.χ*(x.n^p.η)*(y.c^p.σ) - y.w
res_2 = (p.β*(y.c/Y.c)^p.σ)*(1 - p.δ + Y.rk) - 1

Expand Down
71 changes: 49 additions & 22 deletions misc/dev_float32.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,55 +6,82 @@ model_32 = include("$(root_dir)/misc/rbc_float32.jl")

model32 = Dolo.convert_precision(Float32, model_32)

dm32 = Dolo.discretize(model32, Dict(:endo=>[100000]) )

dm32 = Dolo.discretize(model32, Dict(:endo=>[1000]) )

typeof(dm32)

using Adapt

using Adapt
import oneAPI: oneArray
import Adapt: adapt_structure
import CUDA: CuArray
# import CUDA: CuArray


wk0 = Dolo.time_iteration_workspace(dm32)
wk0 = Dolo.time_iteration_workspace(dm32);

wk = Dolo.time_iteration_workspace(dm32, dest=CuArray)
# wk = Dolo.time_iteration_workspace(dm32, dest=CuArray)
wk = Dolo.time_iteration_workspace(dm32, dest=oneArray);


wk = adapt(oneArray, wk0)
# wk = adapt(CuArray, wk0)

# wk = adapt(oneArray, wk0)

wk = adapt(CuArray, wk0)
using KernelAbstractions: get_backend

if typeof(dm32.grid)<:Dolo.CGrid
p = dm32.grid.ranges[1][3]
q = dm32.grid.ranges[2][3]
else
p = length(dm32.grid.grids[1])
q = length(dm32.grid.grids[2])
end

t_e = get_backend(wk.x0)

using KernelAbstractions
using KernelAbstractions: CPU
using StaticArrays


L0 = Dolo.dF_2(dm32, wk0.x0*0, wk0.φ)
L0_gpu = adapt(oneArray, L0)

L = deepcopy(L0)

@time Dolo.dF_2!(L, dm32, wk0.x0, wk0.φ, CPU())

# @time Dolo.dd_2!(L, dm32, wk0.x0, wk0.φ, CPU())


@time Dolo.dF_2!(L0_gpu, dm32, wk.x0, wk.φ, t_e)

Lg = adapt(Array, L0_gpu)


@time L*wk0.x0;

#It works !
using BenchmarkTools
@benchmark Dolo.mul!(wk0.r0, L, wk0.x0, CPU())
@benchmark Dolo.mul!(wk.r0, L0_gpu, wk.x0, t_e)
# try

# res = fun_(wk.r0, dm32, wk.x0, wk.φ; ndrange=(p,q))
# catch err
# nothing
# end

wk0 = Dolo.time_iteration_workspace(dm32)
wk = adapt(CuArray, wk0)
@time Dolo.time_iteration(dm32, wk, verbose=false; engine=:gpu);
wk0 = Dolo.time_iteration_workspace(dm32; interp_mode=:cubic)
# wk = adapt(CuArray, wk0)
wk = adapt(oneArray, wk0)

@time Dolo.time_iteration(dm32, wk, verbose=true, improve=true; engine=:gpu, improve_wait=10, improve_K=500, T=20);




@time Dolo.time_iteration(dm32, wk0, verbose=true, improve=false; T=20);




@time Dolo.time_iteration(dm32, wk0, verbose=false; engine=:gpu);
using ForwardDiff

ForwardDiff.derivative(u->unsafe_trunc(u), 0.1)

Dolo.distance(adapt(Array,wk.x0) , wk0.x0)
ForwardDiff.jacobian(u->trunc.(Int, u), SVector(0.3, 0.2))

# 357 times faster !
ForwardDiff.jacobian(u->unsafe_trunc.(Int, u), SVector(0.3, 0.2))
13 changes: 9 additions & 4 deletions src/adapt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,14 @@ function adapt_structure(to, f::DFun{Dom, Gar, Itp, vars}) where Dom where Gar w
)
end

import KernelAbstractions: get_backend
function adapt_structure(to, L::LL{G,D,F}) where G where D where F
LL(L.grid, adapt(to, L.D), adapt(to, L.φ))
end


get_backend(g::GArray) = get_backend(g.data)
# should it be merged with the general definition?
# import CUDA: CuArray
# distance(x::GVector{G, A}, y::GVector{G,A}) where G where A<:CuArray = Base.mapreduce(u->maximum(u), max, x.data-y.data)

import CUDA: CuArray
distance(x::GVector{G, A}, y::GVector{G,A}) where G where A<:CuArray = Base.mapreduce(u->maximum(u), max, x.data-y.data)
import oneAPI: oneArray
distance(x::GVector{G, A}, y::GVector{G,A}) where G where A<:oneArray = Base.mapreduce(u->maximum(u), max, x.data-y.data)
36 changes: 23 additions & 13 deletions src/algos/time_iteration.jl
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ function F(dmodel::M, s::QP, x::SVector{d,T}, φ::Union{Policy, GArray, DFun}) w
# )
r
# r::SVector{d,T}
r = complementarities(dmodel.model, s,x,r)
# r = complementarities(dmodel.model, s,x,r)
r
end

Expand Down Expand Up @@ -128,7 +128,7 @@ end
using LinearMaps
using Adapt

function time_iteration_workspace(dmodel; interp_mode=:linear, dest=Array)
function time_iteration_workspace(dmodel; interp_mode=:linear, improve=false, dest=Array)

T = eltype(dmodel)

Expand All @@ -146,13 +146,14 @@ function time_iteration_workspace(dmodel; interp_mode=:linear, dest=Array)
vars = variables(dmodel.model.controls)
φ = DFun(dmodel.model.states, x0, vars; interp_mode=interp_mode)

tt = (;x0, x1, x2, r0, dx, J, φ)
# if improve
L = Dolo.dF_2(dmodel, x1, φ)
tt = (;x0, x1, x2, r0, dx, J, L, φ)
# else
# tt = (;x0, x1, x2, r0, dx, J, φ)
# end

res = NamedTuple(
( (k=>adapt(dest, v)) for (k,v) in zip(keys(tt),values(tt)) )
)
return res
# return adapt(dest, tt)
return adapt(dest, tt)

end

Expand All @@ -174,6 +175,7 @@ function time_iteration(model::YModel; kwargs...)
time_iteration(dmodel, wksp; kwargs2...)
end

using oneAPI: oneArray
function time_iteration(model::DYModel,
workspace=time_iteration_workspace(model);
T=500,
Expand All @@ -199,14 +201,18 @@ function time_iteration(model::DYModel,

# mem = typeof(workspace) <: Nothing ? time_iteration_workspace(model) : workspace
mbsteps = 5
lam = 0.5

(;x0, x1, x2, dx, r0, J, φ) = workspace


local η_0 = NaN
local η
convergence = false
iterations = T

(;x0, x1, x2, dx, r0, J, φ) = workspace
Tf = eltype(eltype(x0))

lam = convert(Tf, 0.5)

if engine in (:cpu, :gpu)
t_engine = get_backend(workspace.x0)
Expand All @@ -217,7 +223,7 @@ function time_iteration(model::DYModel,
ti_trace = trace ? IterationTrace(typeof(φ)[]) : nothing

if improve
J_2 = Dolo.dF_2(model, x1, φ)
J_2 = workspace.L
end

for t=1:T
Expand Down Expand Up @@ -306,12 +312,16 @@ function time_iteration(model::DYModel,
Dolo.dF_1!(J_1, model, x1, φ, t_engine)
Dolo.dF_2!(J_2, model, x1, φ, t_engine)

mul!(J_2, -1.0)
mul!(J_2, convert(Tf,-1.0))

# J_2.M_ij[:] *= -1.0
Tp = J_1 \ J_2

d = (x1-x0)
x0.data .+= neumann(Tp, d; K=improve_K)

rhs = neumann(Tp, d; K=improve_K)

x0.data .+= rhs.data

else
x0.data .= x1.data
Expand Down
Loading

0 comments on commit f228dec

Please sign in to comment.