GPUfied splines 1/2

EconForge · May 28, 2024 · f228dec · f228dec
1 parent f2f6754
commit f228dec
Show file tree

Hide file tree

Showing 11 changed files with 260 additions and 143 deletions.
diff --git a/Project.toml b/Project.toml
@@ -11,7 +11,6 @@ BlockDiagonals = "0a1fb500-61f7-11e9-3c65-f5ef3456f9f0"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Continuables = "79afa230-ca09-11e8-120b-5decf7bf5e25"
 Crayons = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
-Cthulhu = "f68482b8-f384-11e8-15f7-abe071a5a75f"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
@@ -30,7 +29,6 @@ LabelledArrays = "2ee39098-c373-598a-b85f-a56591580800"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LinearMaps = "7a12625a-238d-50fd-b39a-03d52299707e"
 LinearOperators = "5c8ed15e-5a4c-59e4-a42b-c7e8811fb125"
-LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
 NLsolve = "2774e3e8-f4cf-5e23-947b-6d7e65073b56"
 Optim = "429524aa-4258-5aef-a3af-852621145aeb"
@@ -47,3 +45,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
+
diff --git a/examples/ymodels/rbc_mc.jl b/examples/ymodels/rbc_mc.jl
@@ -1,6 +1,6 @@
 
 using StaticArrays
-import Dolo: transition, arbitrage
+import Dolo: transition, arbitrage, intermediate
 
 model = let 
 
@@ -87,6 +87,7 @@ function arbitrage(model::typeof(model), s::NamedTuple, x::NamedTuple, S::NamedT
 
 	y = intermediate(model, s, x)
 	Y = intermediate(model, S, X)
+
 	res_1 = p.χ*(x.n^p.η)*(y.c^p.σ) - y.w
 	res_2 = (p.β*(y.c/Y.c)^p.σ)*(1 - p.δ + Y.rk) - 1
 

diff --git a/misc/dev_float32.jl b/misc/dev_float32.jl
@@ -6,55 +6,82 @@ model_32 = include("$(root_dir)/misc/rbc_float32.jl")
 
 model32 = Dolo.convert_precision(Float32, model_32)
 
-dm32 = Dolo.discretize(model32, Dict(:endo=>[100000]) )
-
+dm32 = Dolo.discretize(model32, Dict(:endo=>[1000]) )
 
 typeof(dm32)
 
-using Adapt
 
+using Adapt
 import oneAPI: oneArray
 import Adapt: adapt_structure
-import CUDA: CuArray
+# import CUDA: CuArray
 
 
-wk0 = Dolo.time_iteration_workspace(dm32)
+wk0 = Dolo.time_iteration_workspace(dm32);
 
-wk = Dolo.time_iteration_workspace(dm32, dest=CuArray)
+# wk = Dolo.time_iteration_workspace(dm32, dest=CuArray)
+wk = Dolo.time_iteration_workspace(dm32, dest=oneArray);
+
+
+wk = adapt(oneArray, wk0)
+# wk = adapt(CuArray, wk0)
 
-# wk = adapt(oneArray, wk0)
 
-wk = adapt(CuArray, wk0)
 using KernelAbstractions: get_backend
 
-if typeof(dm32.grid)<:Dolo.CGrid
-    p = dm32.grid.ranges[1][3]
-    q = dm32.grid.ranges[2][3]
-else
-    p = length(dm32.grid.grids[1])
-    q = length(dm32.grid.grids[2])
-end
 
 t_e = get_backend(wk.x0)
 
-using KernelAbstractions
+using KernelAbstractions: CPU
 using StaticArrays
 
 
+L0 = Dolo.dF_2(dm32, wk0.x0*0, wk0.φ)
+L0_gpu = adapt(oneArray, L0)
+
+L = deepcopy(L0)
+
+@time Dolo.dF_2!(L, dm32, wk0.x0, wk0.φ, CPU())
+
+# @time Dolo.dd_2!(L, dm32, wk0.x0, wk0.φ, CPU())
+
+
+@time Dolo.dF_2!(L0_gpu, dm32, wk.x0, wk.φ, t_e)
+
+Lg = adapt(Array, L0_gpu)
+
+
+@time L*wk0.x0;
+
+#It works !
+using BenchmarkTools
+@benchmark Dolo.mul!(wk0.r0, L, wk0.x0, CPU())
+@benchmark Dolo.mul!(wk.r0, L0_gpu, wk.x0, t_e)
 # try
 
     # res = fun_(wk.r0, dm32, wk.x0, wk.φ; ndrange=(p,q))
 # catch err
     # nothing
 # end
 
-wk0 = Dolo.time_iteration_workspace(dm32)
-wk = adapt(CuArray, wk0)
-@time Dolo.time_iteration(dm32, wk, verbose=false; engine=:gpu);
+wk0 = Dolo.time_iteration_workspace(dm32; interp_mode=:cubic)
+# wk = adapt(CuArray, wk0)
+wk = adapt(oneArray, wk0)
+
+@time Dolo.time_iteration(dm32, wk, verbose=true, improve=true; engine=:gpu, improve_wait=10, improve_K=500, T=20);
+
+
+
+
+@time Dolo.time_iteration(dm32, wk0, verbose=true, improve=false;  T=20);
+
+
+
 
-@time Dolo.time_iteration(dm32, wk0, verbose=false; engine=:gpu);
+using ForwardDiff
 
+ForwardDiff.derivative(u->unsafe_trunc(u), 0.1)
 
-Dolo.distance(adapt(Array,wk.x0) , wk0.x0)
+ForwardDiff.jacobian(u->trunc.(Int, u), SVector(0.3, 0.2))
 
-# 357 times faster !
+ForwardDiff.jacobian(u->unsafe_trunc.(Int, u), SVector(0.3, 0.2))
diff --git a/src/adapt.jl b/src/adapt.jl
@@ -31,9 +31,14 @@ function adapt_structure(to, f::DFun{Dom, Gar, Itp, vars}) where Dom where Gar w
     )
 end
 
-import KernelAbstractions: get_backend
+function adapt_structure(to, L::LL{G,D,F}) where G where D where F
+    LL(L.grid, adapt(to, L.D), adapt(to, L.φ))
+end
+
 
-get_backend(g::GArray) = get_backend(g.data)
+# should it be merged with the general definition?
+# import CUDA: CuArray
+# distance(x::GVector{G, A}, y::GVector{G,A}) where G where A<:CuArray = Base.mapreduce(u->maximum(u), max, x.data-y.data)
 
-import CUDA: CuArray
-distance(x::GVector{G, A}, y::GVector{G,A}) where G where A<:CuArray = Base.mapreduce(u->maximum(u), max, x.data-y.data)
+import oneAPI: oneArray
+distance(x::GVector{G, A}, y::GVector{G,A}) where G where A<:oneArray = Base.mapreduce(u->maximum(u), max, x.data-y.data)
diff --git a/src/algos/time_iteration.jl b/src/algos/time_iteration.jl
@@ -33,7 +33,7 @@ function F(dmodel::M, s::QP, x::SVector{d,T}, φ::Union{Policy, GArray, DFun}) w
     # )
     r
     # r::SVector{d,T}
-    r = complementarities(dmodel.model, s,x,r)
+    # r = complementarities(dmodel.model, s,x,r)
     r
 end
 
@@ -128,7 +128,7 @@ end
 using LinearMaps
 using Adapt
 
-function time_iteration_workspace(dmodel; interp_mode=:linear, dest=Array)
+function time_iteration_workspace(dmodel; interp_mode=:linear, improve=false, dest=Array)
 
     T = eltype(dmodel)
 
@@ -146,13 +146,14 @@ function time_iteration_workspace(dmodel; interp_mode=:linear, dest=Array)
     vars = variables(dmodel.model.controls)
     φ = DFun(dmodel.model.states, x0, vars; interp_mode=interp_mode)
 
-    tt = (;x0, x1, x2, r0, dx, J, φ)
+    # if improve
+        L = Dolo.dF_2(dmodel, x1, φ)
+        tt = (;x0, x1, x2, r0, dx, J, L, φ)
+    # else
+        # tt = (;x0, x1, x2, r0, dx, J, φ)
+    # end
 
-    res = NamedTuple(
-        ( (k=>adapt(dest, v)) for (k,v) in zip(keys(tt),values(tt)) )
-    )
-    return res
-    # return adapt(dest, tt)
+    return adapt(dest, tt)
 
 end
 
@@ -174,6 +175,7 @@ function time_iteration(model::YModel; kwargs...)
     time_iteration(dmodel, wksp; kwargs2...)
 end
 
+using oneAPI: oneArray
 function time_iteration(model::DYModel,
     workspace=time_iteration_workspace(model);
     T=500,
@@ -199,14 +201,18 @@ function time_iteration(model::DYModel,
 
     # mem = typeof(workspace) <: Nothing ? time_iteration_workspace(model) : workspace
     mbsteps = 5
-    lam = 0.5
+
+    (;x0, x1, x2, dx, r0, J, φ) = workspace
+
 
     local η_0 = NaN
     local η
     convergence = false
     iterations = T
 
-    (;x0, x1, x2, dx, r0, J, φ) = workspace
+    Tf = eltype(eltype(x0))
+
+    lam = convert(Tf, 0.5)
 
     if engine in (:cpu, :gpu)
         t_engine = get_backend(workspace.x0)
@@ -217,7 +223,7 @@ function time_iteration(model::DYModel,
     ti_trace = trace ? IterationTrace(typeof(φ)[]) : nothing
 
     if improve
-        J_2 = Dolo.dF_2(model, x1, φ)
+        J_2 = workspace.L
     end
 
     for t=1:T
@@ -306,12 +312,16 @@ function time_iteration(model::DYModel,
             Dolo.dF_1!(J_1, model, x1, φ, t_engine)
             Dolo.dF_2!(J_2, model, x1, φ, t_engine)
 
-            mul!(J_2, -1.0)
+            mul!(J_2, convert(Tf,-1.0))
+
             # J_2.M_ij[:] *= -1.0
             Tp = J_1 \ J_2
 
             d = (x1-x0)
-            x0.data .+= neumann(Tp, d; K=improve_K)
+
+            rhs =  neumann(Tp, d; K=improve_K)
+
+            x0.data .+= rhs.data
 
         else
             x0.data .= x1.data