From 596e534912da3a3af55b2b8e9644ebcc013f9c2a Mon Sep 17 00:00:00 2001
From: Pablo Winant <pablo.winant@gmail.com>
Date: Tue, 7 May 2024 20:21:04 +0200
Subject: [PATCH 01/17] WIP

---
 Project.toml     |   4 +-
 misc/test_gpu    |   0
 misc/test_gpu.jl | 340 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 341 insertions(+), 3 deletions(-)
 create mode 100644 misc/test_gpu
 create mode 100644 misc/test_gpu.jl

diff --git a/Project.toml b/Project.toml
index 35ec5134..7a3a4786 100644
--- a/Project.toml
+++ b/Project.toml
@@ -7,13 +7,13 @@ Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 AxisArrays = "39de3d68-74b9-583c-8d2d-e117c070f3a9"
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 BlockDiagonals = "0a1fb500-61f7-11e9-3c65-f5ef3456f9f0"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Continuables = "79afa230-ca09-11e8-120b-5decf7bf5e25"
 Crayons = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 Dolang = "e5c7262c-e9d2-5620-ad8e-1af14eb8a8e3"
-Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41"
 FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"
 Formatting = "59287772-0a20-5a39-b81b-1366585eb4c0"
@@ -35,7 +35,6 @@ Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 ProgressLogging = "33c8b6b6-d38a-422a-b730-caa89a2f386c"
 RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
 ResumableFunctions = "c5292f4c-5179-55e1-98c5-05642aab7184"
-ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
 Roots = "f2b01f46-fcfa-551c-844a-d8ac1e96c665"
 SIMD = "fdea26ae-647d-5447-a871-4b548cad5224"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
@@ -43,4 +42,3 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 StringDistances = "88034a9c-02f8-509d-84a9-84ec65e18404"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6"
-oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
diff --git a/misc/test_gpu b/misc/test_gpu
new file mode 100644
index 00000000..e69de29b
diff --git a/misc/test_gpu.jl b/misc/test_gpu.jl
new file mode 100644
index 00000000..8866f912
--- /dev/null
+++ b/misc/test_gpu.jl
@@ -0,0 +1,340 @@
+using Dolo
+using StaticArrays
+using BenchmarkTools
+import Dolo.splines: CubicInterpolator
+using LoopVectorization
+# using CUDAKernels
+using KernelAbstractions
+using KernelAbstractions: synchronize
+using KernelAbstractions: @kernel, @index, @Const
+using Adapt: adapt
+using CUDA
+import Adapt: adapt_structure, adapt
+import Adapt
+
+# using CUDAKernels
+using KernelAbstractions
+using KernelAbstractions: @kernel, @index, @Const
+using CUDA
+
+
+grid = (
+    (0.0,1.0,20),
+    (0.0,1.0,20),
+    (0.0,1.0,20)
+)
+
+
+# C = rand(SVector{2, Float64}, 22,22,22)
+C = rand(22,22,22)
+
+spl = Dolo.SplineInterpolator{typeof(grid), typeof(C), 3}(grid, C)
+
+# function test(spl; N=100000)
+
+#     out = zeros(eltype(spl.θ), N)
+#     for n=1:N
+#         t = sin(n)/2.0+1.0
+#         x = SVector(t,1-t,t^2)
+#         r = spl(x)
+#         out[n] = r
+#     end
+
+#     return sum(out)
+
+# end
+
+
+# @benchmark  test(spl)
+
+
+# function test2(spl; N=100000)
+
+#     out = zeros(N)
+#     @tturbo for n=1:N
+#         t = sin(n)/2.0+1.0
+#         x = SVector(t,1-t,t^2)
+#         r = spl(x)
+#         out[n] = r
+#         # out[n] = r
+#     end
+
+#     return sum(out)
+
+# end
+
+# @benchmark test2(spl)
+
+# # using CUDAKernels
+
+# @kernel function ker_(out,(spl))
+#     n = @index(Global)
+#     t = sin(n)/2.0+1.0
+#     x = SVector(t,1-t,t^2)
+#     r = spl(x)
+#     out[n] = r
+# end
+
+# function test3(spl; N=100000)
+
+#     out = zeros(N)
+
+#     # backend = get_backend(out)
+
+#     kernel! = ker_(CPU())
+
+#     done = kernel!(out, spl, ndrange=N)
+#     wait(done)
+
+#     return sum(out)
+
+# end
+
+# @benchmark test3(spl)
+
+
+# function test4(spl; N=100000)
+
+#     out = CUDA.zeros(eltype(spl.θ), N)
+
+#     kernel! = ker_(CUDADevice())
+
+#     done = kernel!(out, spl, ndrange=N)
+#     wait(done)
+
+#     return sum(out)
+
+# end
+
+
+adapt_structure(to, s::Dolo.SplineInterpolator{G,C,k}) where G where C where k = let
+    θθ = adapt(to, s.θ)
+    CC = typeof(θθ)
+    Dolo.SplineInterpolator{G,CC,k}(s.grid, θθ)
+end
+
+
+gspl = adapt(CuArray, spl)
+
+# test4(gspl)
+
+# @benchmark test4(gspl)
+
+
+function orphan(model::DoloYAML.Model{ID, Dom}) where ID where Dom
+    P = typeof(model.parameters)
+    DoloYAML.DummyModel{ID, Dom, P}(model.parameters)
+end
+
+function orphan(model::Dolo.YModel)
+    Dolo.YModel(
+        Dolo.name(model),
+        model.states,
+        model.controls,
+        model.exogenous,
+        model.calibration,
+        orphan(model.source)
+    )
+end
+
+
+function orphan(model::Dolo.DYModel)
+    Dolo.DYModel(
+        orphan(model.model),
+        model.grid,
+        model.dproc
+    )
+end
+
+
+
+root_dir = pkgdir(Dolo)
+model = include("$(root_dir)/examples/ymodels/rbc_mc.jl")
+
+
+dmodel = Dolo.discretize(model)
+
+
+
+# model2 = orphan(model)
+# dmodel2 = orphan(dmodel)
+
+
+x0 = Dolo.GVector(dmodel.grid, [Dolo.calibrated(model, :controls) for i=1:length(dmodel.grid)])
+
+φ = Dolo.DFun(model, x0)
+
+Dolo.F(dmodel, x0, φ)
+
+Dolo.calibrated(model, :controls)
+
+import Adapt: adapt_storage
+
+function adapt_structure(to, v::Dolo.GVector{G,V}) where G where V
+    data = adapt(to, v.data)
+    Dolo.GVector{G, typeof(data)}(
+        v.grid,
+        data
+    )
+end
+function adapt_structure(to, f::Dolo.DFun{Dom, Gar, Itp, vars}) where Dom where Gar where Itp where vars
+    itp = adapt(to, f.itp)
+    values = adapt(to, f.values)
+    Dolo.DFun{Dom, typeof(values), typeof(itp), vars}(
+        f.domain,
+        values,
+        itp
+    )
+end
+
+@kernel function ker_F!(out, dmodel, x0, φ)
+
+    n = @index(Global, Linear)
+
+    i,j = Dolo.from_linear(dmodel.grid, n)
+    # I = @index(Global, Cartesian)
+
+    s0 = Dolo.QP( (i,j), dmodel.grid[n])
+    x = x0[n]
+
+    r = Dolo.F(dmodel, s0, x, φ)
+
+    out[n] = r
+
+    # @synchronize
+end
+
+# tt = typeof(model)
+
+
+# struct MyType{A,B}
+#     a::A
+#     b::B
+# end
+
+# mm = MyType((1,2,3),(:a, :b))
+
+
+# source = :(MyType{Tuple{Int64, Int64, Int64}, Tuple{Symbol, Symbol}})
+# dest = :(MyType{Tuple{Int64, Int64, Int64}})
+
+
+# t_source = eval(source)
+# t_dest = eval(dest)
+
+
+function F_cpu(dmodel, x0, φ, K=1000)
+
+    backend = CPU()
+
+    r0 = x0*0                     # TODO
+
+    # kernel! = ker_F(CUDADevice())
+    kernel! = ker_F!(backend)
+
+    # N = length(dmodel.grid)
+    N1 = length(dmodel.grid.g1)
+    N2 = length(dmodel.grid.g2)
+    for k=1:K
+        kernel!(r0, dmodel, x0, φ; ndrange=(N1,N2))
+    end
+
+    # @synchronize
+    r0 = adapt(Array, r0)
+
+    return sum(sum(r0.data))
+    
+end
+
+@time res = F_cpu(dmodel, x0, φ);
+
+
+
+function F_gpu(dmodel, x0, φ, K=1000)
+    
+    backend = CUDABackend()
+    r0 = x0*0                     # TODO
+    r0_gpu = adapt(CuArray, r0)   # TODO
+    x0_gpu = adapt(CuArray, x0)   # TODO
+    φ_gpu = adapt(CuArray, φ)
+
+    # kernel! = ker_F(CUDADevice())
+    kernel! = ker_F!(backend)
+
+    # N = length(dmodel.grid)
+    N1 = length(dmodel.grid.g1)
+    N2 = length(dmodel.grid.g2)
+
+    for k=1:K
+        kernel!(r0_gpu, dmodel, x0_gpu, φ_gpu, ndrange=(N1,N2))
+    end
+    r0 = adapt(Array, r0_gpu)
+    return sum(sum(r0))
+
+end
+
+@time res1 = F_cpu(dmodel, x0, φ);
+@time res2 = F_gpu(dmodel, x0, φ);
+
+
+
+
+
+
+
+
+function F_fast!(r0, dmodel, x0, φ)
+
+    dm = orphan(dmodel)
+    backend = KernelAbstractions.get_backend(x0.data)
+
+    @kernel function ker_FF!(out, dmodel, x0, φ)
+
+        n = @index(Global, Linear)
+
+        i,j = Dolo.from_linear(dmodel.grid, n)
+        
+        s0 = Dolo.QP( (i,j), dmodel.grid[n])
+        x = x0[n]
+        r = Dolo.dF_1(dmodel, s0, x, φ)
+        out[n] = r
+    end
+
+    kernel! = ker_FF!(backend)
+
+    N = length(dm.grid)
+    kernel!(r0, dm, x0, φ, ndrange=N)
+
+end
+
+
+
+r0 = Dolo.dF_1(dmodel, x0, φ)
+r0_gpu = Adapt.adapt(CuArray,r0)
+φ_gpu = Adapt.adapt(CuArray,φ)
+x0_gpu = Adapt.adapt(CuArray,x0)
+
+
+function timeit(r0, φ, x0; K=1000)
+    for k=1:K
+        F_fast!(r0, dmodel, x0, φ)
+    end
+    sum( sum(r0.data) )
+end
+
+@time timeit(r0, φ, x0; K=1)
+@time timeit(r0_gpu, φ_gpu, x0_gpu; K=1)
+
+@time sum(r0_gpu.data)
+
+
+function timeit_FF(r0, φ, x0; K=1000)
+    for k=1:K
+        FF!(r0, dmodel, x0, φ)
+    end
+    sum( sum(r0.data) )
+end
+
+@time timeit_FF(r0, φ, x0; K=1)
+
+@profview timeit_FF(r0, φ, x0; K=1)

From cb30d7a178981a2ba7151a4a46b8b778d19e95a4 Mon Sep 17 00:00:00 2001
From: Pablo Winant <pablo.winant@gmail.com>
Date: Mon, 20 May 2024 16:00:23 +0200
Subject: [PATCH 02/17] WIP

---
 Project.toml      |  4 ++++
 src/dolo_model.jl |  7 +++++++
 src/grids.jl      | 10 +++++-----
 src/processes.jl  | 15 ++++++++-------
 src/space.jl      | 34 +++++++++++++++++-----------------
 5 files changed, 41 insertions(+), 29 deletions(-)

diff --git a/Project.toml b/Project.toml
index 7a3a4786..971d1891 100644
--- a/Project.toml
+++ b/Project.toml
@@ -3,6 +3,7 @@ uuid = "9d24351c-2990-5e1b-a277-04c4b809c898"
 version = "0.5.0-alpha"
 
 [deps]
+AbstractDifferentiation = "c29ec348-61ec-40c8-8164-b8c60e9d9f3d"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 AxisArrays = "39de3d68-74b9-583c-8d2d-e117c070f3a9"
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
@@ -14,6 +15,7 @@ DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 Dolang = "e5c7262c-e9d2-5620-ad8e-1af14eb8a8e3"
+Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41"
 FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"
 Formatting = "59287772-0a20-5a39-b81b-1366585eb4c0"
@@ -42,3 +44,5 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 StringDistances = "88034a9c-02f8-509d-84a9-84ec65e18404"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6"
+Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
+oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
diff --git a/src/dolo_model.jl b/src/dolo_model.jl
index 6e37972a..3cb0e7ba 100644
--- a/src/dolo_model.jl
+++ b/src/dolo_model.jl
@@ -15,6 +15,13 @@ name(::YModel{C,A,B,D,N}) where C where A where B where D where N = N
 
 bounds(model::YModel, s) = model.controls
 
+
+# TODO: check somewhere that the type of
+#  states/controls/exogenous
+# is the same as calibration
+
+eltype(model::YModel) = eltype(model.calibration)
+
 function recalibrate(model::YModel; kwargs...)
     calib = merge(model.calibration, kwargs)
     YModel(model.states, model.controls, model.exogenous, calib, model.source)
diff --git a/src/grids.jl b/src/grids.jl
index f514eda3..3d033802 100644
--- a/src/grids.jl
+++ b/src/grids.jl
@@ -6,8 +6,8 @@ import Base: eltype, iterate, size
 eltype(cg::AGrid{d}) where d = SVector{d, Float64}
 ndims(cg::AGrid{d}) where d = d
 
-struct CGrid{d} <: AGrid{d}
-    ranges::NTuple{d, Tuple{Float64, Float64, Int64}}
+struct CGrid{d,Tf} <: AGrid{d}
+    ranges::NTuple{d, Tuple{Tf, Tf, Int64}}
 end
 
 size(cg::CGrid{d}) where d = tuple((e[3] for e in cg.ranges)... )
@@ -39,8 +39,8 @@ getindex(g::CGrid{1}, ::Colon) = [SVector(i) for i in range(g.ranges[1]...)]
 
 
 
-struct SGrid{n,d} <: ASGrid{d}
-    points::SVector{n,SVector{d, Float64}}
+struct SGrid{n,d,T} <: ASGrid{d}
+    points::SVector{n,SVector{d, T}}
 end
 
 function SGrid(Q::Matrix)
@@ -134,7 +134,7 @@ end
 iterate(s::SGrid) = (s.points[1], 2)
 iterate(s::SGrid, state) = state<=length(s) ? (s.points[state], state+1) : nothing
 
-eltype(cg::CGrid{d}) where d = SVector{d, Float64}
+eltype(cg::CGrid{d,Tf}) where d where Tf = SVector{d, Tf}
 
 function iterate(cg::CGrid{d}) where d
     
diff --git a/src/processes.jl b/src/processes.jl
index 44f4c461..d06e22fa 100644
--- a/src/processes.jl
+++ b/src/processes.jl
@@ -302,9 +302,9 @@ end
 
 discretize(var::VAR1, d::Dict) = length(d)>=1 ? discretize(var, d[:n]) : discretize(var)
 
-struct MarkovChain{names, d, d2, k}
-    P::SMatrix{d,d,Float64,d2}
-    Q::SVector{d, SVector{k, Float64}}
+struct MarkovChain{names, d, d2, k, Tf}
+    P::SMatrix{d,d,Tf,d2}
+    Q::SVector{d, SVector{k, Tf}}
 end
 
 variables(mc::MarkovChain{names}) where names = names
@@ -313,14 +313,15 @@ ndims(mc::MarkovChain{names}) where names = length(names)
 # MarkovChain(names, P, Q) = MarkovChain{names, typeof(P), typeof(Q)}(P,Q)
 
 function MarkovChain(names, P::Matrix, Q::Matrix) 
+    Tf = eltype(P)
     d = size(P,1)
-    sm = SMatrix{d,d,Float64,d*d}(P)
+    sm = SMatrix{d,d,Tf,d*d}(P)
     sv = SVector( tuple( ( SVector(Q[i,:]...) for i in 1:size(Q,1))...)  )
-    MarkovChain{names,d,d^2,length(sv[1])}(sm,sv)
+    MarkovChain{names,d,d^2,length(sv[1]),Tf}(sm,sv)
 end
 
-MarkovChain(names, P::SMatrix, Q::SVector{d,SVector{k,Float64}}) where d where k = MarkovChain{names, size(P,1), length(P), length(Q[1])}(P, Q) # TODO: specify type arguments
-function MarkovChain(P::SMatrix, Q::SVector{d,SVector{k,Float64}}) where d where k
+MarkovChain(names, P::SMatrix, Q::SVector{d,SVector{k,Tf}}) where d where k where Tf = MarkovChain{names, size(P,1), length(P), length(Q[1]),Tf}(P, Q) # TODO: specify type arguments
+function MarkovChain(P::SMatrix, Q::SVector{d,SVector{k,Tf}}) where d where k where Tf
     names = tuple((Symbol(string("e", i)) for i=1:k)...)
     MarkovChain(names, P, Q)
 end
diff --git a/src/space.jl b/src/space.jl
index 4e6a3083..33a0332d 100644
--- a/src/space.jl
+++ b/src/space.jl
@@ -1,31 +1,31 @@
 abstract type Space{d} end
 
-struct CartesianSpace{d,dims}
+struct CartesianSpace{d,dims,Tf}
     # names::NTuple{d, Symbol}
-    min::NTuple{d, Float64}
-    max::NTuple{d, Float64}
+    min::NTuple{d, Tf}
+    max::NTuple{d, Tf}
 end
 
 const CSpace = CartesianSpace
 
-CartesianSpace(a::Tuple{Float64}, b::Tuple{Float64}) = CartesianSpace{length(a), (:x,)}(a,b)
-CartesianSpace(a::Tuple{Float64, Float64}, b::Tuple{Float64, Float64}) = CartesianSpace{length(a), (:x_1, :x_2)}(a,b)
+CartesianSpace(a::Tuple{Tf}, b::Tuple{Tf}) where Tf = CartesianSpace{length(a), (:x,), Tf}(a,b)
+CartesianSpace(a::Tuple{Tf, Tf}, b::Tuple{Tf, Tf}) where Tf = CartesianSpace{length(a), (:x_1, :x_2), Tf}(a,b)
 
-function CartesianSpace(kwargs::Pair{Symbol, Tuple{Float64, Float64}}...)
+function CartesianSpace(kwargs::Pair{Symbol, Tuple{Tf, Tf}}...) where Tf
     
     names = tuple( (e[1] for e in kwargs)... )
     a = tuple( (v[2][1] for v in values(kwargs))... )
     b = tuple( (v[2][2] for v in values(kwargs))... )
     d = length(names)
 
-    return CartesianSpace{d, names}(a,b)
+    return CartesianSpace{d, names, Tf}(a,b)
 
 end
 
 CartesianSpace(;kwargs...) = CartesianSpace(kwargs...)
 
 
-getindex(cs::CartesianSpace{d}, ind::SVector{d, Float64}) where d = ind
+getindex(cs::CartesianSpace{d}, ind::SVector{d, Tf}) where d where Tf= ind
 
 import Base: in
 
@@ -46,15 +46,15 @@ ndims(dom::CartesianSpace{d, dims}) where d where dims = d
 variables(dom::CartesianSpace{d,t}) where d where t = t
 dims(dom::CartesianSpace) = variables(dom)
 
-struct GridSpace{N,d,dims}
-    points::SVector{N,SVector{d,Float64}}
+struct GridSpace{N,d,dims,Tf}
+    points::SVector{N,SVector{d,Tf}}
 end
 
 const GSpace = GridSpace
 
-GridSpace(v::SVector{N, SVector{d, Float64}}) where d where N = GridSpace{length(v), d, (:i_,)}(SVector(v...))
-GridSpace(v::Vector{SVector{d, Float64}}) where d = GridSpace{length(v), d, (:i_,)}(SVector(v...))
-GridSpace(names, v::SVector{k, SVector{d, Float64}}) where k where d = GridSpace{length(v), d, names}(v)
+GridSpace(v::SVector{N, SVector{d, Tf}}) where d where N where Tf = GridSpace{length(v), d, (:i_,), Tf}(SVector(v...))
+GridSpace(v::Vector{SVector{d, Tf}}) where d where Tf = GridSpace{length(v), d, (:i_,), Tf}(SVector(v...))
+GridSpace(names, v::SVector{k, SVector{d, Tf}}) where k where d where Tf = GridSpace{length(v), d, names, Tf}(v)
 
 getindex(gs::GridSpace, i::Int64) = gs.points[i]
 
@@ -111,8 +111,8 @@ function dropnames(namedtuple::NamedTuple, names::Tuple{Vararg{Symbol}})
 end
 
 
-QP(space::CartesianSpace{d}; values...) where d = let
-    s_ =zero(SVector{d,Float64})*NaN
+QP(space::CartesianSpace{d, Tf}; values...) where d where Tf = let
+    s_ =zero(SVector{d,Tf})*NaN
     s0 = QP(s_,s_)
     QP(space, s0; values...)
 end
@@ -136,10 +136,10 @@ QP(space::GridSpace{d}; i_=1) where d = QP(i_,space[i_])
 
 
 
-function QP(space::ProductSpace{<:GridSpace{d1},<:CartesianSpace{d2}}; values...) where d1 where d2
+function QP(space::ProductSpace{<:GridSpace{d1},<:CartesianSpace{d2, Tf}}; values...) where d1 where d2 where Tf
     i0 = get(values, :i_, 1)
     m_ = space.spaces[1][i0]
-    s_ =zero(SVector{d2,Float64})*NaN
+    s_ =zero(SVector{d2,Tf})*NaN
     s0 = QP((i0,s_),SVector(m_...,s_...))
     QP(space, s0; values...)
 end

From 33dad289c3c54b7c4cc4cfabe2efc6ea8673fbab Mon Sep 17 00:00:00 2001
From: Pablo Winant <pablo.winant@gmail.com>
Date: Mon, 20 May 2024 16:00:51 +0200
Subject: [PATCH 03/17] WIP

---
 misc/dev_float32.jl |   7 ++
 misc/rbc_base.jl    | 179 ++++++++++++++++++++++++++++++++++++++++++++
 misc/rbc_float32.jl |  89 ++++++++++++++++++++++
 3 files changed, 275 insertions(+)
 create mode 100644 misc/dev_float32.jl
 create mode 100644 misc/rbc_base.jl
 create mode 100644 misc/rbc_float32.jl

diff --git a/misc/dev_float32.jl b/misc/dev_float32.jl
new file mode 100644
index 00000000..765415ae
--- /dev/null
+++ b/misc/dev_float32.jl
@@ -0,0 +1,7 @@
+using Dolo
+
+root_dir = pkgdir(Dolo)
+model = include("$(root_dir)/examples/ymodels/rbc_mc.jl")
+
+dm = Dolo.discretize(model, Dict(:endo=>[10000000]) )
+
diff --git a/misc/rbc_base.jl b/misc/rbc_base.jl
new file mode 100644
index 00000000..52543b17
--- /dev/null
+++ b/misc/rbc_base.jl
@@ -0,0 +1,179 @@
+
+using StaticArrays
+import Dolo: transition, arbitrage
+import Dolo
+
+model = let 
+
+    name = :rbc_mc
+
+    # calibrate some parameters
+    β = 0.99
+    σ = 5
+    η = 1
+    δ = 0.025
+    α = 0.33
+    ρ = 0.8
+    zbar = 0.0
+    σ_z = 0.016
+    n = 0.33
+    z = zbar
+    rk = 1/β - 1+δ
+    k = n/(rk/α)^(1/(1-α))
+    w = (1-α)*exp(z)*(k/n)^α
+    y = exp(z)*k^α*n^(1-α)
+    i = δ*k
+    c = y - i
+    χ =  w/c^σ/n^η
+
+    calibration = (;β, σ, η, δ, α, ρ, z, n, k, w, y, i, c, χ)
+
+
+    P = @SMatrix [0.4 0.6; 0.6 0.4]
+	# Q = @SMatrix [-0.01; 0.01]
+    Q = SVector( SVector(-0.01), SVector(0.01) )
+
+    # P = @SMatrix [1.0;]
+    # Q = SVector( (SVector(0.0),) )
+        
+    process = Dolo.MarkovChain( (:z,), P, Q )
+
+    states = Dolo.ProductSpace(
+        Dolo.GridSpace((:z,), Q),
+        Dolo.CartesianSpace(;
+            :k => ( k*0.9, k*1.1),
+        )
+    )
+    controls = Dolo.CartesianSpace(;
+        :i => (0.0, 10.0),
+        :n => (0.0, 1.5)
+    )
+
+    # calibration = (;α, β, γ, δ, ρ,χ, η = 2.0, σ = 2.0, i=0.1, n=0.8)
+
+
+    Dolo.YModel(name, states, controls, process, calibration)
+
+end
+
+
+using LoopVectorization
+dmodel = Dolo.discretize(model, Dict(:endo=>10000000))
+
+(;φ, x0) = Dolo.time_iteration_workspace(dmodel)
+
+function FF!(r0, dm, x0,  φ)
+
+    MM = length(dm.grid.g1)
+    a,b,NN = dm.grid.g2.ranges[1]
+    g1 = dm.grid.g1
+    g2 = dm.grid.g2
+
+    p = dm.model.calibration
+    
+    # (;β, σ, η, δ, α, ρ, χ) = dm.model.calibration
+
+    P = dm.model.exogenous.P
+
+    itps = φ.itp
+
+    for i_m=1:MM
+
+        z = g1.points[i_m][1]
+
+        for n_=1:NN
+
+            i,n = x0[i_m, n_]
+
+            k = g2[n_][1] # a+(b-a)*(n_-1)/(N-1)
+
+            y = exp(z)*(k^p.α)*(n^(1-p.α))
+            w = (1-p.α)*y/n
+
+            c = y - i
+
+            res1 = 0.0
+            res2 = 0.0
+
+            for i_M=1:MM
+
+                prob = P[i_m,i_M]
+                Z = g1.points[i_M][1]
+                
+                
+                itp = itps[i_M]
+                
+                
+                K = k*(1-p.δ) + i
+
+                I,N = itp(K)
+                Y = exp(Z)*(K^p.α)*(N^(1-p.α))
+                C = Y - I
+
+                res1 += prob*(p.χ*(n^p.η)*(c^p.σ) - w)
+                res2 += prob*(p.β*(C/c)^(-p.σ)*(1-p.δ+exp(Z)*p.α*K^(p.α-1)*N^(1-p.α))-1)
+
+            end
+
+            r0[i_m,n_] = SVector(res1,res2)
+
+        end
+    end
+
+end
+
+
+function Dolo.transition(model::typeof(model), s::NamedTuple, x::NamedTuple, M::NamedTuple)
+    
+    (;δ, ρ) = model.calibration
+    
+    # Z = e.Z
+    K = s.k * (1-δ) + x.i
+
+    (;k=K,)  ## This is only the endogenous state
+
+end
+
+
+
+function intermediate(model::typeof(model),s::NamedTuple, x::NamedTuple)
+    
+    p = model.calibration
+
+	y = exp(s.z)*(s.k^p.α)*(x.n^(1-p.α))
+	w = (1-p.α)*y/x.n
+	rk = p.α*y/s.k
+	c = y - x.i
+	return ( (; y, c, rk, w))
+
+end
+
+
+function arbitrage(model::typeof(model), s::NamedTuple, x::NamedTuple, S::NamedTuple, X::NamedTuple)
+
+    p = model.calibration
+
+	y = intermediate(model, s, x)
+	Y = intermediate(model, S, X)
+	res_1 = p.χ*(x.n^p.η)*(y.c^p.σ) - y.w
+	res_2 = (p.β*(y.c/Y.c)^p.σ)*(1 - p.δ + Y.rk) - 1
+    
+    return ( (;res_1, res_2) )
+
+end
+
+
+r0 = deepcopy(x0*0)
+r1 = deepcopy(x0*0)
+
+
+using BenchmarkTools
+
+
+# not faster !
+@time FF!(r0, dmodel, x0, φ)
+@time Dolo.F!(r1, dmodel, x0, φ)
+
+@benchmark FF!(r0, dmodel, x0, φ)
+
+@benchmark Dolo.F!(r1, dmodel, x0, φ)
\ No newline at end of file
diff --git a/misc/rbc_float32.jl b/misc/rbc_float32.jl
new file mode 100644
index 00000000..a8fc0986
--- /dev/null
+++ b/misc/rbc_float32.jl
@@ -0,0 +1,89 @@
+using StaticArrays
+import Dolo: transition, arbitrage
+import Dolo
+
+model = let 
+
+    name = :rbc_mc
+
+    # calibrate some parameters
+    β = 0.99
+    σ = 5.0
+    η = 1.0
+    δ = 0.025
+    α = 0.33
+    ρ = 0.8
+    zbar = 0.0
+    σ_z = 0.016
+    n = 0.33
+    z = zbar
+    rk = 1/β - 1+δ
+    k = n/(rk/α)^(1/(1-α))
+    w = (1-α)*exp(z)*(k/n)^α
+    y = exp(z)*k^α*n^(1-α)
+    i = δ*k
+    c = y - i
+    χ =  w/c^σ/n^η
+
+    calibration = (;β, σ, η, δ, α, ρ, z, n, k, w, y, i, c, χ)
+
+
+    P = @SMatrix [0.4 0.6; 0.6 0.4]
+	# Q = @SMatrix [-0.01; 0.01]
+    Q = SVector( SVector(-0.01), SVector(0.01) )
+
+    # P = @SMatrix [1.0;]
+    # Q = SVector( (SVector(0.0),) )
+        
+    process = Dolo.MarkovChain( (:z,), P, Q )
+
+    states = Dolo.ProductSpace(
+        Dolo.GridSpace((:z,), Q),
+        Dolo.CartesianSpace(;
+            :k => ( k*0.9, k*1.1),
+        )
+    )
+    controls = Dolo.CartesianSpace(;
+        :i => (0.0, 10.0),
+        :n => (0.0, 1.5)
+    )
+
+    # calibration = (;α, β, γ, δ, ρ,χ, η = 2.0, σ = 2.0, i=0.1, n=0.8)
+
+
+    Dolo.YModel(name, states, controls, process, calibration)
+
+end
+
+function convert_model(T, model)
+
+    calibration = NamedTuple( ((a,T(b)) for (a,b) in pairs(model.calibration) ) )
+
+    fun = u->T(u)
+    P = fun.(model.exogenous.P)
+    Q = SVector((fun.(e) for e in model.exogenous.Q)...)
+
+    vars = Dolo.variables(model.exogenous)
+    exogenous = Dolo.MarkovChain(vars, P, Q)
+
+    Dolo.YModel(
+        names,
+        model.states,
+        model.controls,
+        exogenous, 
+        calibration
+    )
+end
+
+model32 = convert_model(Float32,model)
+
+dmodel = Dolo.discretize(model)
+
+
+dmodel32 = Dolo.discretize(model32)
+
+wksp = Dolo.time_iteration_workspace(dmodel32)
+
+(;x0,φ)  = wksp
+
+Dolo.time_iteration(model)
\ No newline at end of file

From d7d1806c9f8c5e7d6d2ddd43b0eb9fb160a80b6b Mon Sep 17 00:00:00 2001
From: Pablo Winant <pablo.winant@gmail.com>
Date: Mon, 20 May 2024 19:10:16 +0200
Subject: [PATCH 04/17] WIP

---
 misc/dev_float32.jl                     |  1 +
 misc/rbc_base.jl                        |  8 ++---
 misc/rbc_float32.jl                     | 28 +++++----------
 misc/test_gpu.jl                        |  8 ++---
 src/Dolo.jl                             |  4 +++
 src/algos/simul.jl                      |  6 ++--
 src/algos/time_iteration.jl             |  4 ++-
 src/algos/time_iteration_accelerated.jl | 12 +++----
 src/conversion.jl                       | 39 +++++++++++++++++++++
 src/dolo_model.jl                       |  4 +++
 src/funs.jl                             | 12 +++----
 src/garray.jl                           | 16 ++++-----
 src/grids.jl                            | 45 ++++++++++++++-----------
 13 files changed, 115 insertions(+), 72 deletions(-)
 create mode 100644 src/conversion.jl

diff --git a/misc/dev_float32.jl b/misc/dev_float32.jl
index 765415ae..e7f54566 100644
--- a/misc/dev_float32.jl
+++ b/misc/dev_float32.jl
@@ -5,3 +5,4 @@ model = include("$(root_dir)/examples/ymodels/rbc_mc.jl")
 
 dm = Dolo.discretize(model, Dict(:endo=>[10000000]) )
 
+Dolo.convert
\ No newline at end of file
diff --git a/misc/rbc_base.jl b/misc/rbc_base.jl
index 52543b17..a29612cd 100644
--- a/misc/rbc_base.jl
+++ b/misc/rbc_base.jl
@@ -64,10 +64,10 @@ dmodel = Dolo.discretize(model, Dict(:endo=>10000000))
 
 function FF!(r0, dm, x0,  φ)
 
-    MM = length(dm.grid.g1)
-    a,b,NN = dm.grid.g2.ranges[1]
-    g1 = dm.grid.g1
-    g2 = dm.grid.g2
+    MM = length(dm.grid.grids[1])
+    a,b,NN = dm.grid.grids[2].ranges[1]
+    g1 = dm.grid.grids[1]
+    g2 = dm.grid.grids[2]
 
     p = dm.model.calibration
     
diff --git a/misc/rbc_float32.jl b/misc/rbc_float32.jl
index a8fc0986..3958bab9 100644
--- a/misc/rbc_float32.jl
+++ b/misc/rbc_float32.jl
@@ -55,35 +55,23 @@ model = let
 
 end
 
-function convert_model(T, model)
 
-    calibration = NamedTuple( ((a,T(b)) for (a,b) in pairs(model.calibration) ) )
+dmodel = Dolo.discretize(model)
 
-    fun = u->T(u)
-    P = fun.(model.exogenous.P)
-    Q = SVector((fun.(e) for e in model.exogenous.Q)...)
 
-    vars = Dolo.variables(model.exogenous)
-    exogenous = Dolo.MarkovChain(vars, P, Q)
+model32 = Dolo.convert_precision(Float32,model)
+dmodel32 = Dolo.discretize(model32)
 
-    Dolo.YModel(
-        names,
-        model.states,
-        model.controls,
-        exogenous, 
-        calibration
-    )
-end
+wksp = Dolo.time_iteration_workspace(dmodel32)
 
-model32 = convert_model(Float32,model)
+itps = wksp.φ.itp
 
-dmodel = Dolo.discretize(model)
+(;x0,r0, φ)  = wksp
+
+Dolo.F(dmodel32, x0, φ)
 
 
-dmodel32 = Dolo.discretize(model32)
 
-wksp = Dolo.time_iteration_workspace(dmodel32)
 
-(;x0,φ)  = wksp
 
 Dolo.time_iteration(model)
\ No newline at end of file
diff --git a/misc/test_gpu.jl b/misc/test_gpu.jl
index 8866f912..c41c933e 100644
--- a/misc/test_gpu.jl
+++ b/misc/test_gpu.jl
@@ -233,8 +233,8 @@ function F_cpu(dmodel, x0, φ, K=1000)
     kernel! = ker_F!(backend)
 
     # N = length(dmodel.grid)
-    N1 = length(dmodel.grid.g1)
-    N2 = length(dmodel.grid.g2)
+    N1 = length(dmodel.grid.grids[1])
+    N2 = length(dmodel.grid.grids[2])
     for k=1:K
         kernel!(r0, dmodel, x0, φ; ndrange=(N1,N2))
     end
@@ -262,8 +262,8 @@ function F_gpu(dmodel, x0, φ, K=1000)
     kernel! = ker_F!(backend)
 
     # N = length(dmodel.grid)
-    N1 = length(dmodel.grid.g1)
-    N2 = length(dmodel.grid.g2)
+    N1 = length(dmodel.grid.grids[1])
+    N2 = length(dmodel.grid.grids[2])
 
     for k=1:K
         kernel!(r0_gpu, dmodel, x0_gpu, φ_gpu, ndrange=(N1,N2))
diff --git a/src/Dolo.jl b/src/Dolo.jl
index 6e4afd66..75ac7a0d 100755
--- a/src/Dolo.jl
+++ b/src/Dolo.jl
@@ -73,12 +73,16 @@ module Dolo
     include("garray.jl")
     include("model.jl")
     include("funs.jl")
+    include("conversion.jl")
+
     include("dev_L2.jl")
     include("algos/simul.jl")
     include("algos/results.jl")
     include("algos/time_iteration.jl")
     include("algos/time_iteration_accelerated.jl")
     include("algos/vfi.jl")
+
+
     include("utils.jl")
 
     # function yaml_import(filename)
diff --git a/src/algos/simul.jl b/src/algos/simul.jl
index 84130aa1..d31bc495 100644
--- a/src/algos/simul.jl
+++ b/src/algos/simul.jl
@@ -93,7 +93,7 @@ using ResumableFunctions
 
     p = model.calibration.p
     P = model.transition
-    Q = model.grid.g1.points
+    Q = model.grid.grids[1].points
 
     i = ss[1][1]
 
@@ -113,7 +113,7 @@ using ResumableFunctions
         M = Q[j]
         S = transition(model, m, s, a, M, p)
 
-        for (w, i_S) in trembling__hand(model.grid.g2, S)
+        for (w, i_S) in trembling__hand(model.grid.grids[2], S)
 
             res = (
                 P[i,j]*w,
@@ -121,7 +121,7 @@ using ResumableFunctions
                 (
                     (linear_index ? to__linear_index(model.grid, (j,i_S)) : (j,i_S)),
 
-                    SVector(M..., model.grid.g2[i_S]...)
+                    SVector(M..., model.grid.grids[2][i_S]...)
                 )
             )
             if linear_index
diff --git a/src/algos/time_iteration.jl b/src/algos/time_iteration.jl
index e84a468e..3ecc7159 100644
--- a/src/algos/time_iteration.jl
+++ b/src/algos/time_iteration.jl
@@ -130,6 +130,8 @@ using LinearMaps
 
 function time_iteration_workspace(dmodel; interp_mode=:linear)
 
+    T = eltype(dmodel)
+
     x0 = (Dolo.initial_guess(dmodel))
     x1 = deepcopy(x0)
     x2 = deepcopy(x0)
@@ -139,7 +141,7 @@ function time_iteration_workspace(dmodel; interp_mode=:linear)
     n = length(dx.data[1])
     J = GArray(
         dmodel.grid,
-        zeros(SMatrix{n,n,Float64,n*n}, N)
+        zeros(SMatrix{n,n,T,n*n}, N)
     )
     vars = variables(dmodel.model.controls)
     φ = DFun(dmodel.model.states, x0, vars; interp_mode=interp_mode)
diff --git a/src/algos/time_iteration_accelerated.jl b/src/algos/time_iteration_accelerated.jl
index 67393d82..1f085ef0 100644
--- a/src/algos/time_iteration_accelerated.jl
+++ b/src/algos/time_iteration_accelerated.jl
@@ -63,15 +63,15 @@ function F!(r, model, x, φ, engine::Union{CPU, GPU})
     fun_cpu = FF_(engine)
 
 
-    # p = length(model.grid.g1)
-    # q = length(model.grid.g2)
+    # p = length(model.grid.grids[1])
+    # q = length(model.grid.grids[2])
     # p,q = size(x)
     if typeof(model.grid)<:CGrid
         p = model.grid.ranges[1][3]
         q = model.grid.ranges[2][3]
     else
-        p = length(model.grid.g1)
-        q = length(model.grid.g2)
+        p = length(model.grid.grids[1])
+        q = length(model.grid.grids[2])
     end
     
     res = fun_cpu(r, model, x, φ; ndrange=(p,q))
@@ -108,8 +108,8 @@ function dF_1!(out, model, controls::GArray, φ::Union{GArray, DFun}, ::CPU)
         p = model.grid.ranges[1][3]
         q = model.grid.ranges[2][3]
     else
-        p = length(model.grid.g1)
-        q = length(model.grid.g2)
+        p = length(model.grid.grids[1])
+        q = length(model.grid.grids[2])
     end
     # p,q = size(out)
 
diff --git a/src/conversion.jl b/src/conversion.jl
new file mode 100644
index 00000000..10fea021
--- /dev/null
+++ b/src/conversion.jl
@@ -0,0 +1,39 @@
+
+function convert_precision(T, model::Dolo.YModel)
+
+    # convert calibration
+    calibration = NamedTuple( ((a,T(b)) for (a,b) in pairs(model.calibration) ) )
+
+    # convert exogenous shock
+    fun = u->T(u)
+    P = fun.(model.exogenous.P)
+    Q = SVector((fun.(e) for e in model.exogenous.Q)...)
+
+    vars = Dolo.variables(model.exogenous)
+    exogenous = Dolo.MarkovChain(vars, P, Q)
+
+    # convert states and controls spaces
+    states = convert_precision(T, model.states)
+    controls = convert_precision(T, model.controls)
+
+
+    Dolo.YModel(
+        names,
+        states,
+        controls,
+        exogenous, 
+        calibration
+    )
+end
+
+function convert_precision(Tnew, cspace::Dolo.CartesianSpace{d, vars, Told}) where Told where d where vars
+    Dolo.CartesianSpace{d, vars, Tnew}(cspace.min, cspace.max)
+end
+
+function convert_precision(Tnew, gspace::Dolo.GridSpace{N,d,dims,Tf}) where N where d where dims where Tf
+    Dolo.GridSpace{N,d,dims,Tnew}(gspace.points)
+end
+
+function convert_precision(Tnew, ps::Dolo.ProductSpace)
+    Dolo.ProductSpace((convert_precision(Tnew, e) for e in ps.spaces)...)
+end
diff --git a/src/dolo_model.jl b/src/dolo_model.jl
index 3cb0e7ba..bd76af4a 100644
--- a/src/dolo_model.jl
+++ b/src/dolo_model.jl
@@ -55,6 +55,10 @@ struct DYModel{M, G, D} <: ADModel
     dproc::D
 end
 
+# TODO: check whether true
+eltype(dm::DYModel) = eltype(dm.model)
+
+
 name(dm::DYModel) = name(dm.model)
 
 bounds(dmodel::DYModel, s) = bounds(dmodel.model, s)
diff --git a/src/funs.jl b/src/funs.jl
index eaa5344f..84d01540 100644
--- a/src/funs.jl
+++ b/src/funs.jl
@@ -71,8 +71,8 @@ function DFun(states, values::GVector{G,V}, vars=nothing; interp_mode=:linear) w
     end
 
     # TODO: check values.data[i,:]
-    sz = (e[3] for e in values.grid.g2.ranges)
-    itps = tuple( (SplineInterpolator(values.grid.g2.ranges;  values=reshape(values[i,:], sz...),k=k)  for i=1:length(values.grid.g1)  )...)
+    sz = (e[3] for e in values.grid.grids[2].ranges)
+    itps = tuple( (SplineInterpolator(values.grid.grids[2].ranges;  values=reshape(values[i,:], sz...),k=k)  for i=1:length(values.grid.grids[1])  )...)
 
     if typeof(vars) <: Nothing
         if eltype(values) <: Number
@@ -92,8 +92,8 @@ function fit!(φ::DFun, x::GVector{G}) where G<:PGrid{<:SGrid, <:CGrid}
 
     # This is only for SGrid x CGrid
 
-    n_m = length(x.grid.g1)
-    sz = tuple(n_m, (e[3] for e in x.grid.g2.ranges)...)    
+    n_m = length(x.grid.grids[1])
+    sz = tuple(n_m, (e[3] for e in x.grid.grids[2].ranges)...)    
     xx = reshape( view(x.data, :), sz...)
     rr = tuple( (Colon() for i=1:(Base.ndims(xx)-1))... )
     for i=1:length( φ.itp)
@@ -125,8 +125,8 @@ end
 function (f::DFun{A,B,I,vars})(loc::Tuple{Tuple{Int64}, SVector{d2, U}})  where A where B<:GArray{G,V} where V where I where G<:PGrid{G1,G2} where G1<:SGrid where G2<:CGrid where vars where d2 where U
     # TODO: not beautiful
     x_ = loc[2]
-    dd1 = ndims(f.values.grid.g1)
-    dd2 = ndims(f.values.grid.g2)
+    dd1 = ndims(f.values.grid.grids[1])
+    dd2 = ndims(f.values.grid.grids[2])
     x = SVector((x_[i] for i=(dd1+1):(dd1+dd2))...)
     f.itp[loc[1][1]](x)
 end
diff --git a/src/garray.jl b/src/garray.jl
index 69ad9c6e..d89b7fe8 100644
--- a/src/garray.jl
+++ b/src/garray.jl
@@ -22,20 +22,20 @@ distance(u::SVector, v::SVector) = maximum(abs, u-v)
 distance(u::GVector, v::GVector) = maximum( t->distance(t[1],t[2]), zip(u.data, v.data))
 
 import Base: size
-size(a::GArray{PGrid{G1, G2, d}, T}) where G1 where G2 where d where T = (length(a.grid.g1), length(a.grid.g2))
+size(a::GArray{PGrid{G1, G2, d}, T}) where G1 where G2 where d where T = (length(a.grid.grids[1]), length(a.grid.grids[2]))
 
 
 # TODO: check
-getindex(a::GArray{PGrid{G1, G2, d}, T}, i::Int, j::Int) where G1 where G2 where d where T = a.data[ i + length(a.grid.g1)*(j-1)]
-getindex(a::GArray{PGrid{G1, G2, d}, T}, i::Int, ::Colon) where G1 where G2 where d where T = [a[i,j] for j=1:length(a.grid.g2)]
-getindex(a::GArray{PGrid{G1, G2, d}, T}, ::Colon, j::Int) where G1 where G2 where d where T = [a[i,j] for i=1:length(a.grid.g1)]
+getindex(a::GArray{PGrid{G1, G2, d}, T}, i::Int, j::Int) where G1 where G2 where d where T = a.data[ i + length(a.grid.grids[1])*(j-1)]
+getindex(a::GArray{PGrid{G1, G2, d}, T}, i::Int, ::Colon) where G1 where G2 where d where T = [a[i,j] for j=1:length(a.grid.grids[2])]
+getindex(a::GArray{PGrid{G1, G2, d}, T}, ::Colon, j::Int) where G1 where G2 where d where T = [a[i,j] for i=1:length(a.grid.grids[1])]
 
 getindex(a::GArray{PGrid{G1, G2, d}, T}, ::Colon) where G1 where G2 where d where T = a.data
 
 
 # TODO: check
 function setindex!(a::GArray{PGrid{G1, G2, d}, T}, v, i::Int, j::Int) where G1 where G2 where d where T 
-    (a.data[ i + length(a.grid.g1)*(j-1)] = v)
+    (a.data[ i + length(a.grid.grids[1])*(j-1)] = v)
     nothing
 end
 
@@ -72,8 +72,8 @@ getindex(g::GArray{G,T}, inds::Int64) where G<:AGrid{d} where d where T = g.data
 
 # interpolating indexing
 function (xa::GArray{PGrid{G1, G2, d}, T})(i::Int64, p::SVector{d2, U}) where G1<:SGrid where G2<:CGrid{d2} where d where d2 where T where U
-    g1 = xa.grid.g1
-    g2 = xa.grid.g2
+    g1 = xa.grid.grids[1]
+    g2 = xa.grid.grids[2]
     dims = tuple(length(g1), (e[3] for e in g2.ranges)... )
     # ranges = tuple( (range(e...) for e in g2.ranges)... )
     # v = view(reshape(xa.data, dims),i,:)
@@ -90,7 +90,7 @@ end
 
 function (xa::GArray{PGrid{G1, G2, d}, T})(S::Tuple{Tuple{Int64}, U}) where G1 where G2 where U<:SVector where d where T
     #### TODO: replace
-    n_x =  ndims(xa.grid.g2)
+    n_x =  ndims(xa.grid.grids[2])
     V = S[2]
     n = length(V)
     s = SVector((V[i] for i=n-n_x+1:n)...)
diff --git a/src/grids.jl b/src/grids.jl
index 3d033802..3ccec258 100644
--- a/src/grids.jl
+++ b/src/grids.jl
@@ -54,13 +54,18 @@ end
 
 
 struct ProductGrid{G1, G2, d} <: AGrid{d}
-    g1::G1
-    g2::G2
-    # points::Vector{SVector{d, Float64}}
+    grids::Tuple{G1, G2}
 end
 
 const PGrid = ProductGrid
 
+function ProductGrid(A::AGrid{d1},B::AGrid{d2}) where d1 where d2 
+    println(d1)
+    println(d2)
+    ProductGrid{typeof(A), typeof(B), d1+d2}( (A,B) )
+end
+
+# functionProductGrid(A,B) = ProductGrid{typeof(A), typeof(B), ndims(A)+ndims(B)}((A,B))
 
 getindex(g::SGrid{d}, ::Colon) where d = g.points
 getindex(g::SGrid{d}, i::Int) where d = g.points[i]
@@ -72,25 +77,25 @@ cover(m,v::SVector{d,T}) where d where T = SVector{d,T}(
     (v[i] for i=length(m)+1:length(v))...
 )
 
-PGrid(g1::SGrid{n, d1}, g2::CGrid{d2}) where n where d1 where d2 = PGrid{typeof(g1), CGrid{d2}, d1+d2}(g1, g2)
+# PGrid(g1::SGrid{n, d1}, g2::CGrid{d2}) where n where d1 where d2 = PGrid{typeof(g1), CGrid{d2}, d1+d2}((g1, g2))
 cross(g1::SGrid{d1}, g2::CGrid{d2}) where d1 where d2 = PGrid(g1,g2)
 
 # another way to define multi-dimension cartesian grids
-PGrid(g1::CGrid{d1}, g2::CGrid{d2}) where d1 where d2 = PGrid{typeof(g1), CGrid{d2}, d1+d2}(g1, g2)
+# PGrid(g1::CGrid{d1}, g2::CGrid{d2}) where d1 where d2 = PGrid{typeof(g1), CGrid{d2}, d1+d2}(g1, g2)
 
 import Base: getindex
 
-from_linear(g::PGrid{G1, G2, d}, n) where G1 where G2 where d = let x=divrem(n-1, length(g.g1)); (x[2]+1, x[1]+1) end
+from_linear(g::PGrid{G1, G2, d}, n) where G1 where G2 where d = let x=divrem(n-1, length(g.grids[1])); (x[2]+1, x[1]+1) end
 
 getindex(g::PGrid{G1, G2, d}, n::Int) where G1 where G2 where d = getindex(g, from_linear(g, n)...)
 
 function getindex(g::PGrid{G1, G2, d}, i::Int64, j::Int64) where G1<:SGrid{d1} where G2<:CGrid{d2} where d where d1 where d2
-    SVector{d,Float64}(g.g1[i]..., g.g2[j]...)
+    SVector{d,Float64}(g.grids[1][i]..., g.grids[2][j]...)
 end
 
 
-getindex(g::PGrid{G1, G2, d}, i::Int64, ::Colon) where G1 where G2 where d = g.g2[:] # TODO: should error if i out of bounds
-getindex(g::PGrid{G1, G2, d}, ::Colon, i::Int64) where G1 where G2 where d = g.g1[:]
+getindex(g::PGrid{G1, G2, d}, i::Int64, ::Colon) where G1 where G2 where d = g.grids[2][:] # TODO: should error if i out of bounds
+getindex(g::PGrid{G1, G2, d}, ::Colon, i::Int64) where G1 where G2 where d = g.grids[1][:]
 
 @inline to__linear_index(g::CGrid{2}, ind::Tuple{Int64, Int64}) = let 
     i,j = ind
@@ -98,7 +103,7 @@ getindex(g::PGrid{G1, G2, d}, ::Colon, i::Int64) where G1 where G2 where d = g.g
     return i + p*(j-1)
 end
    
-@inline to__linear_index(g::PGrid, ind::Tuple{Int64, Int64}) =  ind[1] + length(g.g1)*(ind[2]-1)
+@inline to__linear_index(g::PGrid, ind::Tuple{Int64, Int64}) =  ind[1] + length(g.grids[1])*(ind[2]-1)
 
 
 show(io::IO, g::SGrid{d1, d2}) where d1 where d2 = print(io, "SGrid{$(d1)}")
@@ -109,7 +114,7 @@ show(io::IO, g::CGrid{d}) where d = let
 end
 
 
-show(io::IO, g::PGrid) = println(io, "$(g.g1)×$(g.g2)")
+show(io::IO, g::PGrid) = println(io, "$(g.grids[1])×$(g.grids[2])")
 
 import Base: iterate
 import Base: length
@@ -117,7 +122,7 @@ import Base: getindex
 import Base: setindex!
 
 
-length(pg::PGrid{G1, G2, d}) where G1 where G2 where d = length(pg.g1)*length(pg.g2)
+length(pg::PGrid{G1, G2, d}) where G1 where G2 where d = length(pg.grids[1])*length(pg.grids[2])
 length(sg::SGrid{d}) where d = length(sg.points)
 length(cg::CGrid{d}) where d = prod(e[3] for e in cg.ranges)
 
@@ -172,31 +177,31 @@ end
 
 
 function Base.iterate(g::PGrid{G1, G2, d}) where G1 where G2 where d
-    x = g.g1[1]
-    y = g.g2[1]
+    x = g.grids[1][1]
+    y = g.grids[2][1]
     return (SVector{d, Float64}(x...,y...),(y,1,1))
 end
 
 function Base.iterate(g::PGrid{G1,G2,d},state) where G1 where G2 where d
     y,i,j=state
-    if i<length(g.g1)
+    if i<length(g.grids[1])
         i += 1
-        x = g.g1[i]
+        x = g.grids[1][i]
         return (SVector{d,Float64}(x..., y...), (y,i,j))
     else
-        if j==length(g.g2)
+        if j==length(g.grids[2])
             return nothing
         else
             j += 1
             i = 1
-            x = g.g1[i]
-            y = g.g2[j]
+            x = g.grids[1][i]
+            y = g.grids[2][j]
             return (SVector{d,Float64}(x..., y...), (y,i,j))
         end
     end
 end
 
 enum(grid::PGrid) = (
-    QP((c[1],c[2]), grid[c...]) for c in Iterators.product(1:length(grid.g1), 1:length(grid.g2))
+    QP((c[1],c[2]), grid[c...]) for c in Iterators.product(1:length(grid.grids[1]), 1:length(grid.grids[2]))
 )
 

From ff1752914d57abcc7eabd0ef6b6281eb2481970c Mon Sep 17 00:00:00 2001
From: Pablo Winant <pablo.winant@gmail.com>
Date: Mon, 20 May 2024 19:43:30 +0200
Subject: [PATCH 05/17] WIP

---
 misc/dev_float32.jl         |  5 +++--
 misc/rbc_float32.jl         | 41 +++++++++++++++++++++++++++++--------
 src/algos/time_iteration.jl | 10 +++++++--
 src/funs.jl                 |  6 +++---
 src/garray.jl               | 18 ++++++++--------
 src/grids.jl                | 15 +++++++++-----
 src/splines/interp.jl       |  6 +++---
 src/splines/splines.jl      | 12 +++++------
 8 files changed, 76 insertions(+), 37 deletions(-)

diff --git a/misc/dev_float32.jl b/misc/dev_float32.jl
index e7f54566..f074c9d0 100644
--- a/misc/dev_float32.jl
+++ b/misc/dev_float32.jl
@@ -1,8 +1,9 @@
 using Dolo
 
 root_dir = pkgdir(Dolo)
-model = include("$(root_dir)/examples/ymodels/rbc_mc.jl")
+model32 = include("$(root_dir)/misc/rbc_float32.jl")
 
-dm = Dolo.discretize(model, Dict(:endo=>[10000000]) )
+
+dm32 = Dolo.discretize(model, Dict(:endo=>[10000000]) )
 
 Dolo.convert
\ No newline at end of file
diff --git a/misc/rbc_float32.jl b/misc/rbc_float32.jl
index 3958bab9..d6beddf4 100644
--- a/misc/rbc_float32.jl
+++ b/misc/rbc_float32.jl
@@ -56,22 +56,47 @@ model = let
 end
 
 
-dmodel = Dolo.discretize(model)
+# now these are orphans
+model32 = Dolo.convert_precision(Float32,model)
 
 
-model32 = Dolo.convert_precision(Float32,model)
-dmodel32 = Dolo.discretize(model32)
+function Dolo.transition(model::typeof(model32), s::NamedTuple, x::NamedTuple, M::NamedTuple)
+    
+    (;δ, ρ) = model.calibration
+    
+    # Z = e.Z
+    K = s.k * (1-δ) + x.i
 
-wksp = Dolo.time_iteration_workspace(dmodel32)
+    (;k=K,)  ## This is only the endogenous state
 
-itps = wksp.φ.itp
+end
 
-(;x0,r0, φ)  = wksp
 
-Dolo.F(dmodel32, x0, φ)
 
+function intermediate(model::typeof(model32),s::NamedTuple, x::NamedTuple)
+    
+    p = model.calibration
 
+	y = exp(s.z)*(s.k^p.α)*(x.n^(1-p.α))
+	w = (1-p.α)*y/x.n
+	rk = p.α*y/s.k
+	c = y - x.i
+	return ( (; y, c, rk, w))
+
+end
 
 
+function arbitrage(model::typeof(model32), s::NamedTuple, x::NamedTuple, S::NamedTuple, X::NamedTuple)
+
+    p = model.calibration
+
+	y = intermediate(model, s, x)
+	Y = intermediate(model, S, X)
+	res_1 = p.χ*(x.n^p.η)*(y.c^p.σ) - y.w
+	res_2 = (p.β*(y.c/Y.c)^p.σ)*(1 - p.δ + Y.rk) - 1
+    
+    return ( (;res_1, res_2) )
+
+end
 
-Dolo.time_iteration(model)
\ No newline at end of file
+model32
\ No newline at end of file
diff --git a/src/algos/time_iteration.jl b/src/algos/time_iteration.jl
index 3ecc7159..c8ec90ce 100644
--- a/src/algos/time_iteration.jl
+++ b/src/algos/time_iteration.jl
@@ -195,6 +195,7 @@ function time_iteration(model::DYModel,
     lam = 0.5
     
     local η_0 = NaN
+    local η
     convergence = false
     iterations = T
     
@@ -246,6 +247,7 @@ function time_iteration(model::DYModel,
             
             ε_n = norm(r0)
             if ε_n<tol_ε
+                iterations = t
                 break
             end
 
@@ -261,6 +263,7 @@ function time_iteration(model::DYModel,
 
                 ε_b = norm(r0)
                 if ε_b<ε_n
+                    iterations = t
                     break
                 end
             end
@@ -280,6 +283,10 @@ function time_iteration(model::DYModel,
 
         verbose ? append!(log; verbose=verbose, it=t-1, err=ε, sa=η_0, lam=gain, elapsed=elapsed) : nothing
 
+        if η < tol_η
+            iterations = t
+            break
+        end
         η_0 = η
 
 
@@ -314,7 +321,7 @@ function time_iteration(model::DYModel,
         φ,
         iterations,
         tol_η,
-        η_0,
+        η,
         log,
         ti_trace
     )
@@ -329,7 +336,6 @@ function newton(model, workspace=newton_workspace(model);
 
     (;x0, x1, x2, dx, r0, J, φ, T, memn) = workspace
 
-
     for t=1:K
         
         Dolo.fit!(φ, x0)
diff --git a/src/funs.jl b/src/funs.jl
index 84d01540..8be608ee 100644
--- a/src/funs.jl
+++ b/src/funs.jl
@@ -148,9 +148,9 @@ end
 
 # Compatibility calls
 
-(f::DFun)(x::Float64) = f(SVector(x))
-(f::DFun)(x::Float64, y::Float64) = f(SVector(x,y))
-(f::DFun)(x::Vector{SVector{d,Float64}}) where d = [f(e) for e in x]
+(f::DFun)(x::Real) = f(SVector(x))
+(f::DFun)(x::Real, y::Real) = f(SVector(x,y))
+(f::DFun)(x::Vector{SVector{d,<:Real}}) where d = [f(e) for e in x]
 
 
 ndims(df::DFun) = ndims(df.domain)
diff --git a/src/garray.jl b/src/garray.jl
index d89b7fe8..07648185 100644
--- a/src/garray.jl
+++ b/src/garray.jl
@@ -6,9 +6,9 @@ struct GArray{G,U}
 end
 
 const GVector{G,T} = GArray{G,T}
-const GDist{G} = GArray{G, Vector{Float64}}
+const GDist{G,Tf} = GArray{G, Vector{Tf}}
 
-GDist(g::G, v::Vector{Float64}) where G= GArray{G,Vector{Float64}}(g, v)
+GDist(g::G, v::Vector{Tf}) where G where Tf = GArray{G,Vector{Tf}}(g, v)
 
 
 norm(v::GArray) = maximum(u->maximum(abs, u), v.data)
@@ -45,7 +45,10 @@ end
 eltype(g::GArray{G,T}) where G where T = eltype(T)
 
 # warning: these functions don't copy any data
-ravel(g::GArray) = reinterpret(Float64, g.data)
+function ravel(g::GArray) 
+    Tf = eltype(g.data[1])
+    reinterpret(Tf, g.data)
+end
 unravel(g::GArray, x) = GArray(
     g.grid,
     reinterpret(eltype(g), x)
@@ -113,22 +116,21 @@ import Base: *, \, +, -, /
 *(x::Number, A::GArray{G,T}) where G where T = GArray(A.grid, x .* A.data)
 
 
-*(A::GArray{G,Vector{T}}, x::SVector{q, Float64}) where G where T <:SMatrix{p, q, Float64, n}  where p where q where n = GArray(A.grid, [M*x for M in A.data])
-# *(A::GArray{G,Vector{T}}, x::SLArray{Tuple{q}, Float64, 1, q, names}) where G where T <:SMatrix{p, q, Float64, n}  where p where q where n where names = A*SVector(x...)
+*(A::GArray{G,Vector{T}}, x::SVector{q, Tf}) where G where T <:SMatrix{p, q, Tf, n}  where p where q where n where Tf = GArray(A.grid, [M*x for M in A.data])
 
 
-*(A::GArray{G,T}, B::AbstractArray{Float64}) where G where T <:SMatrix{p, q, Float64, n}  where p where q where n = 
+*(A::GArray{G,T}, B::AbstractArray{Tf}) where G where T <:SMatrix{p, q, Tf, n}  where p where q where n where Tf = 
     ravel(
         GArray(
             A.grid,
-            A.data .* reinterpret(SVector{q, Float64}, B)
+            A.data .* reinterpret(SVector{q, Tf}, B)
         )
     )
 
 
 import Base: convert
 
-function Base.convert(::Type{Matrix}, A::GArray{G,Vector{T}}) where G where T <:SMatrix{p, q, Float64, k}  where p where q where k
+function Base.convert(::Type{Matrix}, A::GArray{G,Vector{T}}) where G where T <:SMatrix{p, q, Tf, k}  where p where q where k where Tf
     N = length(A.data)
     n0 = N*p
     n1 = N*q
diff --git a/src/grids.jl b/src/grids.jl
index 3ccec258..07062c74 100644
--- a/src/grids.jl
+++ b/src/grids.jl
@@ -3,7 +3,7 @@ abstract type ASGrid{d} <: AGrid{d} end
 
 import Base: eltype, iterate, size
 
-eltype(cg::AGrid{d}) where d = SVector{d, Float64}
+# eltype(cg::AGrid{d}) where d = SVector{d, Float64}
 ndims(cg::AGrid{d}) where d = d
 
 struct CGrid{d,Tf} <: AGrid{d}
@@ -90,7 +90,8 @@ from_linear(g::PGrid{G1, G2, d}, n) where G1 where G2 where d = let x=divrem(n-1
 getindex(g::PGrid{G1, G2, d}, n::Int) where G1 where G2 where d = getindex(g, from_linear(g, n)...)
 
 function getindex(g::PGrid{G1, G2, d}, i::Int64, j::Int64) where G1<:SGrid{d1} where G2<:CGrid{d2} where d where d1 where d2
-    SVector{d,Float64}(g.grids[1][i]..., g.grids[2][j]...)
+    Tf = eltype(g)
+    SVector{d,Tf}(g.grids[1][i]..., g.grids[2][j]...)
 end
 
 
@@ -177,17 +178,21 @@ end
 
 
 function Base.iterate(g::PGrid{G1, G2, d}) where G1 where G2 where d
+    T = eltype(g)
     x = g.grids[1][1]
     y = g.grids[2][1]
-    return (SVector{d, Float64}(x...,y...),(y,1,1))
+    return (SVector{d, T}(x...,y...),(y,1,1))
 end
 
 function Base.iterate(g::PGrid{G1,G2,d},state) where G1 where G2 where d
+
+    T = eltype(g)
+
     y,i,j=state
     if i<length(g.grids[1])
         i += 1
         x = g.grids[1][i]
-        return (SVector{d,Float64}(x..., y...), (y,i,j))
+        return (SVector{d,T}(x..., y...), (y,i,j))
     else
         if j==length(g.grids[2])
             return nothing
@@ -196,7 +201,7 @@ function Base.iterate(g::PGrid{G1,G2,d},state) where G1 where G2 where d
             i = 1
             x = g.grids[1][i]
             y = g.grids[2][j]
-            return (SVector{d,Float64}(x..., y...), (y,i,j))
+            return (SVector{d,T}(x..., y...), (y,i,j))
         end
     end
 end
diff --git a/src/splines/interp.jl b/src/splines/interp.jl
index d3b4974e..9b2abfb3 100644
--- a/src/splines/interp.jl
+++ b/src/splines/interp.jl
@@ -2,7 +2,7 @@ using StaticArrays
 import LoopVectorization: VectorizationBase
 import Base: getindex
 
-getindex(A::Vector{Float64}, i::VectorizationBase.Vec{4,Int64}) = VectorizationBase.Vec{4, Float64}(A[i(1)], A[i(2)], A[i(3)], A[i(4)])
+getindex(A::Vector{Tf}, i::VectorizationBase.Vec{4,Int64}) where Tf = VectorizationBase.Vec{4, Tf}(A[i(1)], A[i(2)], A[i(3)], A[i(4)])
 
 
 # ## TODO : rewrite the following
@@ -44,7 +44,7 @@ matextract(v::AbstractArray{T,1}, i) where T = SArray{Tuple{2}, T, 1, 2}(
     v[i+1]
 )
 
-function interp(ranges::NTuple{d, Tuple{Float64, Float64, Int64}}, values::AbstractArray{T,d}, x::SVector{d, U}) where d where T where U
+function interp(ranges::NTuple{d, Tuple{Tf, Tf, Int64}}, values::AbstractArray{T,d}, x::SVector{d, U}) where d where T where U where Tf
     
     a = SVector( (e[1] for e in ranges)... )
     b = SVector( (e[2] for e in ranges)... )
@@ -67,7 +67,7 @@ function interp(ranges::NTuple{d, Tuple{Float64, Float64, Int64}}, values::Abstr
 
 end
 
-function interp(ranges::NTuple{d, Tuple{Float64, Float64, Int64}}, values::AbstractArray{T,d}, x::Vararg{U}) where d where T where U
+function interp(ranges::NTuple{d, Tuple{Tf, Tf, Int64}}, values::AbstractArray{T,d}, x::Vararg{U}) where d where T where U where Tf
     xx = SVector(x...)
     interp(ranges, values, xx)
 end
diff --git a/src/splines/splines.jl b/src/splines/splines.jl
index f77b98a3..deb4fec1 100644
--- a/src/splines/splines.jl
+++ b/src/splines/splines.jl
@@ -14,11 +14,11 @@ function interpolant_cspline(a, b, orders, V)
 
     coefs = filter_coeffs(a, b, orders, V)
 
-    function fun(s::Array{Float64,2})
+    function fun(s::Array{Tf,2}) where Tf
         return eval_UC_spline(a, b, orders, coefs, s)
     end
 
-    function fun(p::Float64...)
+    function fun(p::Tf...) where Tf
         return fun([p...]')
     end
 
@@ -45,7 +45,7 @@ end
 
 
 
-    function prefilter(ranges::NTuple{d,Tuple{Float64,Float64,i}}, V::AbstractArray{T, d}, ::Val{3}) where d where i<:Int where T
+    function prefilter(ranges::NTuple{d,Tuple{Tf,Tf,i}}, V::AbstractArray{T, d}, ::Val{3}) where d where i<:Int where T where Tf
         θ = zeros(eltype(V), ((e[3]+2) for e in ranges)...)
         ind = tuple( (2:(e[3]+1) for e in ranges )...)
         θ[ind...] = V
@@ -54,17 +54,17 @@ end
     end
 
 
-    function prefilter!(θ::AbstractArray{T, d}, grid::NTuple{d,Tuple{Float64,Float64,i}}, V::AbstractArray{T, d}, ::Val{3}) where d where i<:Int where T
+    function prefilter!(θ::AbstractArray{T, d}, grid::NTuple{d,Tuple{Tf,Tf,i}}, V::AbstractArray{T, d}, ::Val{3}) where d where i<:Int where T where Tf
         splines.prefilter!(θ)
     end
 
-    function prefilter(ranges::NTuple{d,Tuple{Float64,Float64,i}}, V::AbstractArray{T, d}, ::Val{1}) where d where i<:Int where T
+    function prefilter(ranges::NTuple{d,Tuple{Tf,Tf,i}}, V::AbstractArray{T, d}, ::Val{1}) where d where i<:Int where T where Tf
         θ = copy(V)
         return θ
     end
 
 
-    function prefilter!(θ::AbstractArray{T, d}, grid::NTuple{d,Tuple{Float64,Float64,i}}, V::AbstractArray{T, d}, ::Val{1}) where d where i<:Int where T
+    function prefilter!(θ::AbstractArray{T, d}, grid::NTuple{d,Tuple{Tf,Tf,i}}, V::AbstractArray{T, d}, ::Val{1}) where d where i<:Int where T where Tf
         θ .= V
     end
 

From 9a673ca9a8bf24e6b8650359052546d6a774b0fd Mon Sep 17 00:00:00 2001
From: Pablo Winant <pablo.winant@gmail.com>
Date: Sun, 26 May 2024 16:25:49 +0200
Subject: [PATCH 06/17] WIP

---
 Project.toml                            |   1 +
 misc/dev_float32.jl                     |  92 ++++++-
 misc/test_gpu.jl                        |  14 +-
 src/Dolo.jl                             |   1 +
 src/adapt.jl                            |  36 +++
 src/algos/time_iteration.jl             |  20 +-
 src/algos/time_iteration_accelerated.jl |   6 +-
 src/grids.jl                            |  16 +-
 src/splines/csplines.jl                 |  12 +-
 src/splines/interp.jl                   |   3 +-
 src/splines/interp_misc.jl              | 347 ++++++++++++++++++++++++
 src/splines/macros.jl                   |  70 +++--
 12 files changed, 562 insertions(+), 56 deletions(-)
 create mode 100644 src/adapt.jl
 create mode 100644 src/splines/interp_misc.jl

diff --git a/Project.toml b/Project.toml
index 971d1891..fbd43119 100644
--- a/Project.toml
+++ b/Project.toml
@@ -11,6 +11,7 @@ BlockDiagonals = "0a1fb500-61f7-11e9-3c65-f5ef3456f9f0"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Continuables = "79afa230-ca09-11e8-120b-5decf7bf5e25"
 Crayons = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
+Cthulhu = "f68482b8-f384-11e8-15f7-abe071a5a75f"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
diff --git a/misc/dev_float32.jl b/misc/dev_float32.jl
index f074c9d0..45e3fc08 100644
--- a/misc/dev_float32.jl
+++ b/misc/dev_float32.jl
@@ -1,9 +1,95 @@
 using Dolo
 
 root_dir = pkgdir(Dolo)
-model32 = include("$(root_dir)/misc/rbc_float32.jl")
+model_32 = include("$(root_dir)/misc/rbc_float32.jl")
 
 
-dm32 = Dolo.discretize(model, Dict(:endo=>[10000000]) )
+model32 = Dolo.convert_precision(Float32, model_32)
 
-Dolo.convert
\ No newline at end of file
+dm32 = Dolo.discretize(model32, Dict(:endo=>[100]) )
+
+
+typeof(dm32)
+
+using Adapt
+
+
+wk0 = Dolo.time_iteration_workspace(dm32)
+
+import oneAPI: oneArray
+import Adapt: adapt_structure
+
+wk = adapt(oneArray, wk0)
+using KernelAbstractions: get_backend
+
+if typeof(dm32.grid)<:Dolo.CGrid
+    p = dm32.grid.ranges[1][3]
+    q = dm32.grid.ranges[2][3]
+else
+    p = length(dm32.grid.grids[1])
+    q = length(dm32.grid.grids[2])
+end
+
+t_e = get_backend(wk.x0)
+
+using KernelAbstractions
+using StaticArrays
+
+@kernel function ggg(r, dm, x, φ)
+# @kernel function fun(dm, r, x)
+
+
+    n = @index(Global, Linear)
+    ind = @index(Global, Cartesian)
+    (i,j) = ind.I
+
+
+    # k = length(dm.grid.grids[1])
+    # i_ = (n-1)÷k
+    # j_ = (n-1) - (i)*κ
+    # i = i_+1
+    # j = j_+1
+    
+    # TODO: special function here
+    s_ = dm.grid[n]
+    s = Dolo.QP((i,j), s_)
+
+    xx = x[n]
+
+    # (i,j) = @inline Dolo.from_linear(model.grid, n)    
+
+    rr = Dolo.F(dm, s, xx, φ)
+
+    r[n] = rr
+
+end
+
+
+
+fun_ = ggg(t_e)
+@time fun_(wk.r0, dm32, wk.x0, wk.φ; ndrange=(p,q))
+
+
+
+@time Dolo.F!(wk0.r0, dm32, wk0.x0, wk0.φ)
+
+# try
+
+    # res = fun_(wk.r0, dm32, wk.x0, wk.φ; ndrange=(p,q))
+# catch err
+    # nothing
+# end
+
+
+
+# Dolo.time_iteration(dm32, wk; engine=:gpu)
+
+
+s_0 = dm32.grid[10]
+s0 = Dolo.QP((2,2), s_0)
+xx0 = wk0.x0[10]
+
+
+fon(dm32, s0, xx0) = sum(w*S.val for (w,S) in Dolo.τ(dm32, s0, xx0))
+
+@code_warntype fon(dm32, s0, xx0)
\ No newline at end of file
diff --git a/misc/test_gpu.jl b/misc/test_gpu.jl
index c41c933e..b9eaa033 100644
--- a/misc/test_gpu.jl
+++ b/misc/test_gpu.jl
@@ -107,12 +107,6 @@ spl = Dolo.SplineInterpolator{typeof(grid), typeof(C), 3}(grid, C)
 # end
 
 
-adapt_structure(to, s::Dolo.SplineInterpolator{G,C,k}) where G where C where k = let
-    θθ = adapt(to, s.θ)
-    CC = typeof(θθ)
-    Dolo.SplineInterpolator{G,CC,k}(s.grid, θθ)
-end
-
 
 gspl = adapt(CuArray, spl)
 
@@ -170,6 +164,14 @@ Dolo.calibrated(model, :controls)
 
 import Adapt: adapt_storage
 
+
+
+adapt_structure(to, s::Dolo.SplineInterpolator{G,C,k}) where G where C where k = let
+    θθ = adapt(to, s.θ)
+    CC = typeof(θθ)
+    Dolo.SplineInterpolator{G,CC,k}(s.grid, θθ)
+end
+
 function adapt_structure(to, v::Dolo.GVector{G,V}) where G where V
     data = adapt(to, v.data)
     Dolo.GVector{G, typeof(data)}(
diff --git a/src/Dolo.jl b/src/Dolo.jl
index 75ac7a0d..b460bdc0 100755
--- a/src/Dolo.jl
+++ b/src/Dolo.jl
@@ -83,6 +83,7 @@ module Dolo
     include("algos/vfi.jl")
 
 
+    include("adapt.jl")
     include("utils.jl")
 
     # function yaml_import(filename)
diff --git a/src/adapt.jl b/src/adapt.jl
new file mode 100644
index 00000000..10362ef8
--- /dev/null
+++ b/src/adapt.jl
@@ -0,0 +1,36 @@
+import Adapt: adapt, adapt_structure
+
+adapt_structure(to, s::SplineInterpolator{G,C,k}) where G where C where k = let
+    θθ = adapt(to, s.θ)
+    CC = typeof(θθ)
+    SplineInterpolator{G,CC,k}(s.grid, θθ)
+end
+
+function adapt_structure(to, v::GVector{G,V}) where G where V
+    data = adapt(to, v.data)
+    GVector{G, typeof(data)}(
+        v.grid,
+        data
+    )
+end
+
+function adapt_structure(to, v::GArray{G,V}) where G where V
+    data = adapt(to, v.data)
+    GArray(
+        v.grid,
+        data
+    )
+end
+function adapt_structure(to, f::DFun{Dom, Gar, Itp, vars}) where Dom where Gar where Itp where vars
+    itp = adapt(to, f.itp)
+    values = adapt(to, f.values)
+    DFun{Dom, typeof(values), typeof(itp), vars}(
+        f.domain,
+        values,
+        itp
+    )
+end
+
+import KernelAbstractions: get_backend
+
+get_backend(g::GArray) = get_backend(g.data)
diff --git a/src/algos/time_iteration.jl b/src/algos/time_iteration.jl
index c8ec90ce..f96a18bf 100644
--- a/src/algos/time_iteration.jl
+++ b/src/algos/time_iteration.jl
@@ -126,9 +126,9 @@ end
 
 
 using LinearMaps
+using Adapt
 
-
-function time_iteration_workspace(dmodel; interp_mode=:linear)
+function time_iteration_workspace(dmodel; interp_mode=:linear, dest=Array)
 
     T = eltype(dmodel)
 
@@ -145,7 +145,10 @@ function time_iteration_workspace(dmodel; interp_mode=:linear)
     )
     vars = variables(dmodel.model.controls)
     φ = DFun(dmodel.model.states, x0, vars; interp_mode=interp_mode)
-    return (;x0, x1, x2, r0, dx, J, φ)
+
+    tt = (;x0, x1, x2, r0, dx, J, φ)
+
+    return adapt(dest, tt)
 
 end
 
@@ -199,16 +202,14 @@ function time_iteration(model::DYModel,
     convergence = false
     iterations = T
     
-    if engine==:cpu
-        t_engine = CPU()
-    elseif engine==:gpu
-        t_engine = GPU()
+    (;x0, x1, x2, dx, r0, J, φ) = workspace
+
+    if engine in (:cpu, :gpu)
+        t_engine = get_backend(workspace.x0)
     else
         t_engine = nothing
     end
 
-    (;x0, x1, x2, dx, r0, J, φ) = workspace
-
     ti_trace = trace ? IterationTrace(typeof(φ)[]) : nothing
 
     if improve
@@ -224,7 +225,6 @@ function time_iteration(model::DYModel,
         trace && push!(ti_trace.data, deepcopy(φ))
 
         F!(r0, model, x0, φ, t_engine)
-    
         # r0 = F(model, x0, φ)
 
         ε = norm(r0)
diff --git a/src/algos/time_iteration_accelerated.jl b/src/algos/time_iteration_accelerated.jl
index 1f085ef0..a4ca41f4 100644
--- a/src/algos/time_iteration_accelerated.jl
+++ b/src/algos/time_iteration_accelerated.jl
@@ -37,14 +37,12 @@ using KernelAbstractions
 # end
 
 # This is for a PGrid model only 
-function F!(r, model, x, φ, engine::Union{CPU, GPU})
+function F!(r, model, x, φ, engine)
 
     @kernel function FF_(r, @Const(model), @Const(x), @Const(φ))
 
-        # c = @index(Global, Cartesian)
-        # c = @index(Global, Cartesian)
+
         n = @index(Global, Linear)
-        # i,j = c.I
         (i,j) = Dolo.from_linear(model.grid, n)
 
         s_ = model.grid[n]
diff --git a/src/grids.jl b/src/grids.jl
index 07062c74..c4062159 100644
--- a/src/grids.jl
+++ b/src/grids.jl
@@ -15,7 +15,7 @@ size(cg::CGrid{d}) where d = tuple((e[3] for e in cg.ranges)... )
 
 getindex(g::CGrid{d}, inds::Vararg{Int64,d}) where d = SVector{d}(
     ( 
-        ( g.ranges[i][1] + (g.ranges[i][2]-g.ranges[i][1])*( (inds[i]-1)/(g.ranges[i][3]-1)) )
+        ( g.ranges[i][1] + (g.ranges[i][2]-g.ranges[i][1])*( (inds[i]-1)/(g.ranges[i][3]-convert(eltype(eltype(g)),1))) )
         for i=1:d
     )
 )
@@ -85,13 +85,17 @@ cross(g1::SGrid{d1}, g2::CGrid{d2}) where d1 where d2 = PGrid(g1,g2)
 
 import Base: getindex
 
-from_linear(g::PGrid{G1, G2, d}, n) where G1 where G2 where d = let x=divrem(n-1, length(g.grids[1])); (x[2]+1, x[1]+1) end
+@inline from_linear(g::PGrid{G1, G2, d}, n::Int) where G1 where G2 where d = let x=divrem(n-1, length(g.grids[1])); (x[2]+1, x[1]+1) end
 
-getindex(g::PGrid{G1, G2, d}, n::Int) where G1 where G2 where d = getindex(g, from_linear(g, n)...)
+@inline getindex(g::PGrid{G1, G2, d}, n::Int) where G1 where G2 where d = getindex(g, from_linear(g, n)...)
 
-function getindex(g::PGrid{G1, G2, d}, i::Int64, j::Int64) where G1<:SGrid{d1} where G2<:CGrid{d2} where d where d1 where d2
-    Tf = eltype(g)
-    SVector{d,Tf}(g.grids[1][i]..., g.grids[2][j]...)
+# eltype(cg::CGrid{d,Tf}) where d where Tf = SVector{d, Tf}
+
+
+@inline function getindex(g::PGrid{G1, G2, d}, i::Int64, j::Int64) where G1<:SGrid{d1} where G2<:CGrid{d2} where d where d1 where d2
+    # Tf = eltype(g)
+    # SVector{d,Tf}(g.grids[1][i]..., g.grids[2][j]...)
+    SVector(g.grids[1][i]..., g.grids[2][j]...)
 end
 
 
diff --git a/src/splines/csplines.jl b/src/splines/csplines.jl
index 75a40a91..cdcf0ae3 100644
--- a/src/splines/csplines.jl
+++ b/src/splines/csplines.jl
@@ -4,21 +4,21 @@ using StaticArrays
 
 
 # old style call
-function eval_UC_spline(a, b, orders, C::AbstractArray{Float64}, S::Matrix{Float64})
+function eval_UC_spline(a, b, orders, C::AbstractArray{Tf}, S::Matrix{Tf}) where Tf
     d,N = size(S)
 
     n_x = size(C,1)
     dims = tuple(size(C)[2:end]...)
     out = zeros(n_x,N)
-    CC = reshape(reinterpret(SVector{n_x,Float64},vec(C)),dims)
-    SS = reshape(reinterpret(SVector{d,Float64},vec(S)),(N,))
-    V  = reshape(reinterpret(SVector{n_x,Float64},vec(out)),(N,))
+    CC = reshape(reinterpret(SVector{n_x,Tf},vec(C)),dims)
+    SS = reshape(reinterpret(SVector{d,Tf},vec(S)),(N,))
+    V  = reshape(reinterpret(SVector{n_x,Tf},vec(out)),(N,))
     eval_UC_spline!(a, b, orders, CC, SS, V)
     return out
 end
 
 
-function eval_UC_spline(a, b, orders, C::AbstractArray{T, d}, S::Vector{SVector{d,Float64}}) where T where d
+function eval_UC_spline(a, b, orders, C::AbstractArray{T, d}, S::Vector{SVector{d,Tf}}) where T where d where Tf
     N = length(S)
     vals = zeros(T,N)
     eval_UC_spline!(a, b, orders, C, S, vals)
@@ -26,7 +26,7 @@ function eval_UC_spline(a, b, orders, C::AbstractArray{T, d}, S::Vector{SVector
 end
 
 
-@generated function eval_UC_spline!(a, b, orders, C::AbstractArray{T, d}, S::AbstractVector{SVector{d, Float64}}, V::AbstractVector{T}) where d where T
+@generated function eval_UC_spline!(a, b, orders, C::AbstractArray{T, d}, S::AbstractVector{SVector{d,Tf}}, V::AbstractVector{T}) where d where T where Tf
     # d = C.parameters[2]-1 # first dimension of C indexes the splines
     # the speed penalty of extrapolating points when iterating over point
     # seems very small so this is the default
diff --git a/src/splines/interp.jl b/src/splines/interp.jl
index 9b2abfb3..c68a8f1d 100644
--- a/src/splines/interp.jl
+++ b/src/splines/interp.jl
@@ -58,7 +58,8 @@ function interp(ranges::NTuple{d, Tuple{Tf, Tf, Int64}}, values::AbstractArray{T
 
     λ = (x.-(a .+ δ.*i))./δ
 
-    i_ = floor.(Int, i) .+ 1
+    # i_ = floor.(Int, i) .+ 1
+    i_ = unsafe_trunc.(Int, i) .+ 1
 
     M = matextract(values, i_...)
     
diff --git a/src/splines/interp_misc.jl b/src/splines/interp_misc.jl
new file mode 100644
index 00000000..8339b170
--- /dev/null
+++ b/src/splines/interp_misc.jl
@@ -0,0 +1,347 @@
+using StaticArrays
+import LoopVectorization: VectorizationBase
+import Base: getindex
+
+getindex(A::Vector{Float64}, i::VectorizationBase.Vec{4,Int64}) = VectorizationBase.Vec{4, Float64}(A[i(1)], A[i(2)], A[i(3)], A[i(4)])
+
+
+# ## TODO : rewrite the following
+# it is currently slower than the custom versions below
+
+@inline function reduce_tensors(λ::SVector{d1,U}, v::SArray{T,V,d2,W}) where d1 where d2 where T where U where V where W
+    vv = tuple( (SVector(1-e, e) for e in λ)...) 
+    xx = SVector( (prod(e) for e in Iterators.product(vv...))...)
+    return sum( xx .* v[:])
+end
+
+matextract(v::AbstractArray{T,3}, i,j,k) where T = SArray{Tuple{2, 2, 2}, T, 3, 8}(
+    v[i,  j  ,k],
+    v[i+1,j  ,k],
+    v[i  ,j+1,k],
+    v[i+1,j+1,k],
+    v[i  ,j  ,k+1],
+    v[i+1,j  ,k+1],
+    v[i  ,j+1,k+1],
+    v[i+1,j+1,k+1]
+)
+
+matextract(v::AbstractArray{T,2}, i, j) where T = SArray{Tuple{2,2}, T, 2, 4}(
+    v[i,j],
+    v[i+1,j],
+    v[i,j+1],
+    v[i+1,j+1]
+)
+matextract(v::AbstractArray{T,1}, i) where T = SArray{Tuple{2}, T, 1, 2}(
+    v[i],
+    v[i+1]
+)
+
+function interp(ranges::NTuple{d, Tuple{Float64, Float64, Int64}}, values::AbstractArray{T,d}, x::SVector{d, U}) where d where T where U
+    
+    a = SVector( (e[1] for e in ranges)... )
+    b = SVector( (e[2] for e in ranges)... )
+    n = SVector( (e[3] for e in ranges)... )
+
+    δ = (b.-a)./(n.-1)
+
+    i = div.( (x.-a), δ)
+
+    i = max.(min.(i, n.-2), 0)
+
+    λ = (x.-(a .+ δ.*i))./δ
+
+    i_ = floor.(Int, i) .+ 1
+
+    # inds = tuple( (SVector(e,e+1) for e in i_)...)
+    # return reduce_tensors( λ, values[inds...] )
+
+    return reduce_tensors( λ, matextract(values, i_...) )
+
+end
+
+function interp(ranges::NTuple{d, Tuple{Float64, Float64, Int64}}, values::AbstractArray{T,d}, x::Vararg{U}) where d where T where U
+    xx = SVector(x...)
+    interp(ranges, values, xx)
+end
+
+# @inline function reduce_tensors(λ::SVector{3,U}, v::SArray{T,Float64,3,V}) where d1 where d2 where T where U where V
+#     (1.0 - λ[3]) * (             
+#         (1.0 - λ[2])*(
+#             (1.0 - λ[1])*v[1,1,1] + λ[1]*v[2,1,1]
+#         ) + λ[2] * (
+#             (1.0 - λ[1])*v[1,2,1] + λ[1]*v[2,2,1]
+#         )
+#     )
+#     + λ[3] * (
+#         (1.0 - λ[2])*(
+#             (1.0 - λ[1])*v[1,1,2] + λ[1]*v[2,1,2]
+#         ) + λ[2] * (
+#             (1.0 - λ[1])*v[1,2,2] + λ[1]*v[2,2,2]
+#         )
+#     )
+# end
+
+
+
+
+# @inline function interp(ranges::Tuple{Tuple{Float64, Float64, Int64}}, values::AbstractArray{T}, x::U) where T where U
+# # function interp(ranges::Tuple{Tuple{Float64, Float64, Int64}}, values, x) where T where U 
+
+#     a = (ranges[1][1])
+#     b = (ranges[1][2])
+#     n = (ranges[1][3])
+
+#     δ = (b-a)/(n-1)
+
+#     i = div( (x-a), δ)
+
+#     i = max(min(i, n-2), 0)
+
+#     λ = (x-(a + δ*i))/δ
+
+#     i_ = floor(Int, i) + 1
+
+#     v0 = values[i_]
+#     v1 = values[i_+1]
+
+#     (1.0 - λ)*v0 + λ*v1
+
+# end
+
+# @inline function interp(ranges::Tuple{Tuple{Float64, Float64, Int64},Tuple{Float64, Float64, Int64}}, values::AbstractArray{T}, x_1::U, x_2::U) where T where U
+#     # function interp(ranges::Tuple{Tuple{Float64, Float64, Int64}}, values, x) where T where U 
+    
+#         a_1 = (ranges[1][1])
+#         b_1 = (ranges[1][2])
+#         n_1 = (ranges[1][3])
+    
+#         a_2 = (ranges[2][1])
+#         b_2 = (ranges[2][2])
+#         n_2 = (ranges[2][3])
+
+#         δ_1 = (b_1-a_1)/(n_1-1)
+#         δ_2 = (b_2-a_2)/(n_2-1)
+    
+#         i_1 = div( (x_1-a_1), δ_1)
+#         i_2 = div( (x_2-a_2), δ_2)
+    
+#         i_1 = max(min(i_1, n_1-2), 0)
+#         i_2 = max(min(i_2, n_2-2), 0)
+    
+#         λ_1 = (x_1-(a_1 + δ_1*i_1))/δ_1
+#         λ_2 = (x_2-(a_2 + δ_2*i_2))/δ_2
+    
+#         i_1_ = floor(Int, i_1) + 1
+#         i_2_ = floor(Int, i_2) + 1
+
+#         v00 = values[i_1_, i_2_]
+#         v01 = values[i_1_, i_2_+1]
+#         v10 = values[i_1_+1, i_2_]
+#         v11 = values[i_1_+1, i_2_+1]
+    
+#         res = (1.0 - λ_2)*(
+#             (1.0 - λ_1)*v00 + λ_1*v10
+#         ) + λ_2 * (
+#             (1.0 - λ_1)*v01 + λ_1*v11
+#         )
+    
+#         return res
+
+# end
+
+
+# @inline function interp(ranges::Tuple{
+#             Tuple{Float64, Float64, Int64},
+#             Tuple{Float64, Float64, Int64},
+#             Tuple{Float64, Float64, Int64}
+#         }, values::AbstractArray{T}, x_1::U, x_2::U, x_3::U) where T where U
+    
+#         a_1 = (ranges[1][1])
+#         b_1 = (ranges[1][2])
+#         n_1 = (ranges[1][3])
+    
+#         a_2 = (ranges[2][1])
+#         b_2 = (ranges[2][2])
+#         n_2 = (ranges[2][3])
+
+#         a_3 = (ranges[3][1])
+#         b_3 = (ranges[3][2])
+#         n_3 = (ranges[3][3])
+
+#         δ_1 = (b_1-a_1)/(n_1-1)
+#         δ_2 = (b_2-a_2)/(n_2-1)
+#         δ_3 = (b_3-a_3)/(n_3-1)
+    
+#         i_1 = div( (x_1-a_1), δ_1)
+#         i_2 = div( (x_2-a_2), δ_2)
+#         i_3 = div( (x_3-a_3), δ_3)
+
+
+#         i_1 = max(min(i_1, n_1-2), 0)
+#         i_2 = max(min(i_2, n_2-2), 0)
+#         i_3 = max(min(i_3, n_3-2), 0)
+    
+#         λ_1 = (x_1-(a_1 + δ_1*i_1))/δ_1
+#         λ_2 = (x_2-(a_2 + δ_2*i_2))/δ_2
+#         λ_3 = (x_3-(a_3 + δ_3*i_3))/δ_3
+    
+#         i_1_ = floor(Int, i_1) + 1
+#         i_2_ = floor(Int, i_2) + 1
+#         i_3_ = floor(Int, i_3) + 1
+
+#         v000 = values[i_1_,   i_2_,   i_3_]
+#         v001 = values[i_1_,   i_2_,   i_3_+1]
+#         v010 = values[i_1_,   i_2_+1, i_3_]
+#         v011 = values[i_1_,   i_2_+1, i_3_+1]
+#         v100 = values[i_1_+1, i_2_,   i_3_]
+#         v101 = values[i_1_+1, i_2_,   i_3_+1]
+#         v110 = values[i_1_+1, i_2_+1, i_3_]
+#         v111 = values[i_1_+1, i_2_+1, i_3_+1]
+
+
+#         res = (1.0 - λ_3) * (             
+#                 (1.0 - λ_2)*(
+#                     (1.0 - λ_1)*v000 + λ_1*v100
+#                 ) + λ_2 * (
+#                     (1.0 - λ_1)*v010 + λ_1*v110
+#                 )
+#             )
+#             + λ_3 * (
+#                 (1.0 - λ_2)*(
+#                     (1.0 - λ_1)*v001 + λ_1*v101
+#                 ) + λ_2 * (
+#                     (1.0 - λ_1)*v011 + λ_1*v111
+#                 )
+#             )
+
+#         return res
+
+# end
+
+
+function vecinterp_1(ranges, values::Vector{T}, x::Vector{<:Number}) where T
+    N = length(x)
+    out = zeros(T,N)
+    for n=1:N
+        out[n] = interp(ranges, values, x[n])
+    end
+    return out
+end
+
+
+function vecinterp_2(ranges, values::Vector{T}, x::Vector{<:Number}) where T
+    N = length(x)
+    out = zeros(T,N)
+    out .= (u->interp(ranges, values, u)).(x)
+    return out
+end
+
+using LoopVectorization
+
+function vecinterp_3(ranges, values::Vector{T}, x::Vector{<:Number}) where T
+    N = length(x)
+    out = zeros(T,N)
+    @simd for n=1:N
+        out[n] = interp(ranges, values, x[n])
+    end
+    return out
+end
+
+
+using LoopVectorization
+
+function vecinterp_4(ranges, values::Vector{T}, X::Vector{<:Number}) where T
+    
+    K = length(X)
+    out = zeros(T,K)
+
+    a = (ranges[1][1])
+    b = (ranges[1][2])
+    N = (ranges[1][3])
+
+    δ = (b-a)/(N-1)
+
+    @turbo for n=1:K
+
+        x = X[n]
+
+        i = div( (x-a), δ)
+
+        i = max(min(i, N-2), 0)
+
+        λ = (x-(a + δ*i))/δ
+
+        i_ = floor(Int, i) + 1
+
+        v0 = values[i_]
+        v1 = values[i_+1]
+        # v0 = getindex(values, i_)
+        # v1 = getindex(values, i_+1)
+
+        out[n] = (1-λ)*v0 + λ*v1
+    end
+    return out
+end
+
+function vecinterp_5(ranges, values::Vector{T}, x::Vector{<:Number}) where T
+    N = length(x)
+    out = zeros(T,N)
+    @turbo for n=1:N
+        ret = interp(ranges, values, x[n])
+        out[n] = ret
+    end
+    return out
+end
+
+
+
+
+
+# @code_warntype interp(ranges, values, 2.0)
+
+# xvec = [range(-0.1, 2.1; length=1000)...]
+# tvec = xvec .^ 2
+
+# yvec = [(interp(ranges, values, x)) for x in xvec]
+
+
+# using ForwardDiff
+
+# ForwardDiff.jacobian(u->[interp(ranges, values, u[1])], [0.3])
+
+# using Plots
+
+# plot(xvec, tvec)
+# plot!(xvec, yvec)
+
+# using LoopVectorization: VectorizationBase
+
+
+# a,b,N = ranges[1]
+# x = VectorizationBase.Vec{4, Float64}(4, 3, 2, 5)
+# δ = (b-a)/(N-1)
+
+# i = div( (x-a), δ)
+
+# i = max(min(i, N-2), 0)
+
+# λ = (x-(a + δ*i))/δ
+
+# i_ = floor(Int, i) + 1
+
+# v0 = values[i_]
+# v1 = values[i_+1]
+
+# out[n] = (1-λ)*v0 + λ*v1
+
+
+
+# import Base: getindex
+# getindex(A::Vector{T}, i::VectorizationBase.Vec{4,Int64}) where T where d = VectorizationBase.Vec{4, T}(A[i(1)], A[i(2)], A[i(3)], A[i(4)])
+
+# getindex(values, i_)
+
+
+# values[i_]
+
+# i_(1)
\ No newline at end of file
diff --git a/src/splines/macros.jl b/src/splines/macros.jl
index e0955eea..2629ae3f 100644
--- a/src/splines/macros.jl
+++ b/src/splines/macros.jl
@@ -21,31 +21,59 @@ using StaticArrays
 #     0.0 0.0  1.0  0.0
 #  ]
 
-const Ad = [
+const Ad64 = [
    [-1.0/6.0  3.0/6.0 -3.0/6.0 1.0/6.0];
    [ 3.0/6.0 -6.0/6.0  0.0/6.0 4.0/6.0];
    [-3.0/6.0  3.0/6.0  3.0/6.0 1.0/6.0];
    [ 1.0/6.0  0.0/6.0  0.0/6.0 0.0/6.0]
 ]
 
-const dAd = [
+const dAd64 = [
    [ 0.0 -0.5  1.0 -0.5];
    [ 0.0  1.5 -2.0  0.0];
    [ 0.0 -1.5  1.0  0.5];
    [ 0.0  0.5  0.0  0.0]
 ]
 
-const d2Ad = [
+const d2Ad64 = [
    [ 0.0 0.0 -1.0  1.0];
    [ 0.0 0.0  3.0 -2.0];
    [ 0.0 0.0 -3.0  1.0];
    [ 0.0 0.0  1.0  0.0]
 ]
 
+const Ad32 = [
+   [-1.0f0/6.0f0  3.0f0/6.0f0 -3.0f0/6.0f0 1.0f0/6.0f0];
+   [ 3.0f0/6.0f0 -6.0f0/6.0f0  0.0f0/6.0f0 4.0f0/6.0f0];
+   [-3.0f0/6.0f0  3.0f0/6.0f0  3.0f0/6.0f0 1.0f0/6.0f0];
+   [ 1.0f0/6.0f0  0.0f0/6.0f0  0.0f0/6.0f0 0.0f0/6.0f0]
+]
+
+const dAd32 = [
+   [ 0.0f0 -0.5f0  1.0f0 -0.5f0];
+   [ 0.0f0  1.5f0 -2.0f0  0.0f0];
+   [ 0.0f0 -1.5f0  1.0f0  0.5f0];
+   [ 0.0f0  0.5f0  0.0f0  0.0f0]
+]
+
+const d2Ad32 = [
+   [ 0.0f0 0.0f0 -1.0f0  1.0f0];
+   [ 0.0f0 0.0f0  3.0f0 -2.0f0];
+   [ 0.0f0 0.0f0 -3.0f0  1.0f0];
+   [ 0.0f0 0.0f0  1.0f0  0.0f0]
+]
+
 U(s,i) = Symbol(string(s,i))
 U(s,i,j) = Symbol(string(s,i,"_",j))
 
-function create_Phi(d, extrap, diff)
+function create_Phi(d, extrap, diff; Tf=Float64)
+    if Tf==Float64
+        Ad = Ad64
+        dAd = dAd64
+    elseif Tf==Float32
+        Ad = Ad32
+        dAd = dAd32
+    end
     lines = []
     for i=1:d
         block = []
@@ -113,13 +141,13 @@ tensor_prod(["Phi_1", "Phi_2"], Int64[])   # Phi_1_1 * (Phi_2_1 * C[i1 + 1,i2 +
 tensor_prod(["Phi_1", "dPhi_2"], Int64[])  # Phi_1_1 * (dPhi_2_1 * C[i1 + 1,i2 + 1] + dPhi_2_2 * C[i1 + 1,i2 + 2] + dPhi_2_3 * C[i1 + 1,i2 + 3] + dPhi_2_4 * C[i1 + 1,i2 + 4]) + Phi_1_2 * (dPhi_2_1 * C[i1 + 2,i2 + 1] + dPhi_2_2 * C[i1 + 2,i2 + 2] + dPhi_2_3 * C[i1 + 2,i2 + 3] + dPhi_2_4 * C[i1 + 2,i2 + 4]) + Phi_1_3 * (dPhi_2_1 * C[i1 + 3,i2 + 1] + dPhi_2_2 * C[i1 + 3,i2 + 2] + dPhi_2_3 * C[i1 + 3,i2 + 3] + dPhi_2_4 * C[i1 + 3,i2 + 4]) + Phi_1_4 * (dPhi_2_1 * C[i1 + 4,i2 + 1] + dPhi_2_2 * C[i1 + 4,i2 + 2] + dPhi_2_3 * C[i1 + 4,i2 + 3] + dPhi_2_4 * C[i1 + 4,i2 + 4])
 
 
-function create_parameters(d)
+function create_parameters(d; Tf=Float64)
     lines = []
     for i=1:d
         block = quote
             $(U("M",i)) = orders[$i]
             $(U("start",i)) =  a[$i]
-            $(U("dinv",i)) = (orders[$i]-1.0)/(b[$i]-a[$i])
+            $(U("dinv",i)) = (orders[$i]-convert($Tf,1.0))/(b[$i]-a[$i])
         end
         for ll in block.args
             push!(lines, ll)
@@ -128,20 +156,21 @@ function create_parameters(d)
     return lines
 end
 
-function create_local_parameters(d; vectorize=true)
+function create_local_parameters(d; vectorize=true, Tf=Float64)
     lines = []
     if vectorize
         for i=1:d
             bl = quote
                 $(U("x",i)) = S[n][$i]
                 $(U("u",i)) = ($(U("x",i)) - $(U("start",i)))*$(U("dinv",i))
-                $(U("i",i)) = (floor(Int,$(U("u",i)) ))
+                # $(U("i",i)) = (floor(Int,$(U("u",i)) ))
+                $(U("i",i)) = unsafe_trunc(Int32, $(U("u",i)))
                 $(U("i",i)) = max( min($(U("i",i)),$(U("M",i))-2), 0 )
                 $(U("t",i)) = $(U("u",i))-$(U("i",i))
                 $(U("tp",i,1)) = $(U("t",i))*$(U("t",i))*$(U("t",i))
                 $(U("tp",i,2)) = $(U("t",i))*$(U("t",i))
                 $(U("tp",i,3)) = $(U("t",i))
-                $(U("tp",i,4)) = 1.0;
+                $(U("tp",i,4)) = convert($Tf,1.0);   #TODO
             end
             for ll in bl.args
                 push!(lines, ll)
@@ -153,13 +182,14 @@ function create_local_parameters(d; vectorize=true)
             bl = quote
                 $(U("x",i)) = S[$i]
                 $(U("u",i)) = ($(U("x",i)) - $(U("start",i)))*$(U("dinv",i))
-                $(U("i",i)) = (floor(Int,$(U("u",i)) ))
+                # $(U("i",i)) = (floor(Int,$(U("u",i)) ))
+                $(U("i",i)) = unsafe_trunc(Int32, $(U("u",i)) )
                 $(U("i",i)) = max( min($(U("i",i)),$(U("M",i))-2), 0 )
                 $(U("t",i)) = $(U("u",i))-$(U("i",i))
                 $(U("tp",i,1)) = $(U("t",i))*$(U("t",i))*$(U("t",i))
                 $(U("tp",i,2)) = $(U("t",i))*$(U("t",i))
                 $(U("tp",i,3)) = $(U("t",i))
-                $(U("tp",i,4)) = 1.0;
+                $(U("tp",i,4)) = convert($Tf,1.0);
             end
             for ll in bl.args
                 push!(lines, ll)
@@ -169,17 +199,17 @@ function create_local_parameters(d; vectorize=true)
     return lines
 end
 
-function create_function(d,extrap="natural"; vectorize=true)
+function create_function(d,extrap="natural"; vectorize=true, Tf=Float32)
     if vectorize
         expr = quote
-            function $(Symbol(string("eval_UC_spline!")))( a, b, orders, C::Array{T,d}, S::Vector{SVector{d,Float64}}, V::Vector{T}) where d where T
-                $(create_parameters(d)...)
+            function $(Symbol(string("eval_UC_spline!")))( a, b, orders, C::Array{T,d}, S::Vector{SVector{d,$Tf}}, V::Vector{T}) where d where T
+                $(create_parameters(d; Tf=Tf)...)
                 N = size(S,1)
                 # @fastmath @inbounds @simd( # doesn't seem to make any difference
                 for n=1:N
-                    $(create_local_parameters(d)...)
-                    $(create_Phi(d,extrap,false)...)
-                    V[n] = $( tensor_prod([string("Phi_",i) for i=1:d], Int64[]) )
+                    $(create_local_parameters(d;Tf=Tf)...)
+                    $(create_Phi(d,extrap,false;Tf=Tf)...)
+                    V[n] = $( tensor_prod([string("Phi_",i) for i=1:d], Int64[]; Tf=Tf) )
                 end
                 # )
             end
@@ -187,11 +217,11 @@ function create_function(d,extrap="natural"; vectorize=true)
     else
         expr = quote
             function $(Symbol(string("eval_UC_spline")))( a, b, orders, C::Array{T,d}, S::SVector{d,U}) where d where T where U
-                $(create_parameters(d)...)
+                $(create_parameters(d,Tf=Tf)...)
                 N = size(S,1)
                 # @fastmath @inbounds @simd( # doesn't seem to make any difference
-                $(create_local_parameters(d; vectorize=false)...)
-                $(create_Phi(d,extrap,false)...)
+                $(create_local_parameters(d; vectorize=false,Tf=Tf)...)
+                $(create_Phi(d,extrap,false; Tf=Tf)...)
                 V = $( tensor_prod([string("Phi_",i) for i=1:d], Int64[]) )
                 # )
                 return V

From 31a6252219e2e2d5627ff2d749a4c9323cdf9b25 Mon Sep 17 00:00:00 2001
From: Pablo Winant <pablo.winant@gmail.com>
Date: Sun, 26 May 2024 23:19:43 +0200
Subject: [PATCH 07/17] WIP

---
 misc/dev_float32.jl         | 25 ++++++++++++++++++-------
 misc/rbc_float32.jl         |  8 ++++----
 src/algos/time_iteration.jl |  6 +++++-
 3 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/misc/dev_float32.jl b/misc/dev_float32.jl
index 45e3fc08..1269e4c1 100644
--- a/misc/dev_float32.jl
+++ b/misc/dev_float32.jl
@@ -6,20 +6,24 @@ model_32 = include("$(root_dir)/misc/rbc_float32.jl")
 
 model32 = Dolo.convert_precision(Float32, model_32)
 
-dm32 = Dolo.discretize(model32, Dict(:endo=>[100]) )
+dm32 = Dolo.discretize(model32, Dict(:endo=>[1000]) )
 
 
 typeof(dm32)
 
 using Adapt
 
+import oneAPI: oneArray
+import Adapt: adapt_structure
+import CUDA: CuArray
+
 
 wk0 = Dolo.time_iteration_workspace(dm32)
 
-import oneAPI: oneArray
-import Adapt: adapt_structure
+wk = Dolo.time_iteration_workspace(dm32, dest=CuArray)
 
-wk = adapt(oneArray, wk0)
+# wk = adapt(oneArray, wk0)
+wk = adapt(CuArray, wk0)
 using KernelAbstractions: get_backend
 
 if typeof(dm32.grid)<:Dolo.CGrid
@@ -64,14 +68,21 @@ using StaticArrays
 
 end
 
-
+using BenchmarkTools
 
 fun_ = ggg(t_e)
+
 @time fun_(wk.r0, dm32, wk.x0, wk.φ; ndrange=(p,q))
 
 
+@benchmark fun_(wk.r0, dm32, wk.x0, wk.φ; ndrange=(p,q))
+
+
+@benchmark Dolo.F!(wk0.r0, dm32, wk0.x0, wk0.φ)
+
+
+Dolo.distance(wk0.r0, adapt(Array,wk.r0))
 
-@time Dolo.F!(wk0.r0, dm32, wk0.x0, wk0.φ)
 
 # try
 
@@ -82,7 +93,7 @@ fun_ = ggg(t_e)
 
 
 
-# Dolo.time_iteration(dm32, wk; engine=:gpu)
+Dolo.time_iteration(dm32, wk; engine=:gpu)
 
 
 s_0 = dm32.grid[10]
diff --git a/misc/rbc_float32.jl b/misc/rbc_float32.jl
index d6beddf4..b6b65fb6 100644
--- a/misc/rbc_float32.jl
+++ b/misc/rbc_float32.jl
@@ -65,7 +65,7 @@ function Dolo.transition(model::typeof(model32), s::NamedTuple, x::NamedTuple, M
     (;δ, ρ) = model.calibration
     
     # Z = e.Z
-    K = s.k * (1-δ) + x.i
+    K = s.k * (1f0-δ) + x.i
 
     (;k=K,)  ## This is only the endogenous state
 
@@ -77,8 +77,8 @@ function intermediate(model::typeof(model32),s::NamedTuple, x::NamedTuple)
     
     p = model.calibration
 
-	y = exp(s.z)*(s.k^p.α)*(x.n^(1-p.α))
-	w = (1-p.α)*y/x.n
+	y = exp(s.z)*(s.k^p.α)*(x.n^(1f0-p.α))
+	w = (1f0-p.α)*y/x.n
 	rk = p.α*y/s.k
 	c = y - x.i
 	return ( (; y, c, rk, w))
@@ -93,7 +93,7 @@ function arbitrage(model::typeof(model32), s::NamedTuple, x::NamedTuple, S::Name
 	y = intermediate(model, s, x)
 	Y = intermediate(model, S, X)
 	res_1 = p.χ*(x.n^p.η)*(y.c^p.σ) - y.w
-	res_2 = (p.β*(y.c/Y.c)^p.σ)*(1 - p.δ + Y.rk) - 1
+	res_2 = (p.β*(y.c/Y.c)^p.σ)*(1f0 - p.δ + Y.rk) - 1f0
     
     return ( (;res_1, res_2) )
 
diff --git a/src/algos/time_iteration.jl b/src/algos/time_iteration.jl
index f96a18bf..95a6334c 100644
--- a/src/algos/time_iteration.jl
+++ b/src/algos/time_iteration.jl
@@ -148,7 +148,11 @@ function time_iteration_workspace(dmodel; interp_mode=:linear, dest=Array)
 
     tt = (;x0, x1, x2, r0, dx, J, φ)
 
-    return adapt(dest, tt)
+    res = NamedTuple(
+        ( (k=>adapt(dest, v)) for (k,v) in zip(keys(tt),values(tt)) )
+    )
+    return res
+    # return adapt(dest, tt)
 
 end
 

From f2f67540b0ca973e76befb3b0c766c09305c1f4a Mon Sep 17 00:00:00 2001
From: Pablo Winant <pablo.winant@gmail.com>
Date: Mon, 27 May 2024 09:18:24 +0200
Subject: [PATCH 08/17] Almost there !

---
 misc/dev_float32.jl                     | 62 +++-------------------
 src/adapt.jl                            |  3 ++
 src/algos/time_iteration_accelerated.jl | 69 +++++++++++++++----------
 3 files changed, 53 insertions(+), 81 deletions(-)

diff --git a/misc/dev_float32.jl b/misc/dev_float32.jl
index 1269e4c1..24be933e 100644
--- a/misc/dev_float32.jl
+++ b/misc/dev_float32.jl
@@ -6,7 +6,7 @@ model_32 = include("$(root_dir)/misc/rbc_float32.jl")
 
 model32 = Dolo.convert_precision(Float32, model_32)
 
-dm32 = Dolo.discretize(model32, Dict(:endo=>[1000]) )
+dm32 = Dolo.discretize(model32, Dict(:endo=>[100000]) )
 
 
 typeof(dm32)
@@ -23,6 +23,7 @@ wk0 = Dolo.time_iteration_workspace(dm32)
 wk = Dolo.time_iteration_workspace(dm32, dest=CuArray)
 
 # wk = adapt(oneArray, wk0)
+
 wk = adapt(CuArray, wk0)
 using KernelAbstractions: get_backend
 
@@ -39,50 +40,6 @@ t_e = get_backend(wk.x0)
 using KernelAbstractions
 using StaticArrays
 
-@kernel function ggg(r, dm, x, φ)
-# @kernel function fun(dm, r, x)
-
-
-    n = @index(Global, Linear)
-    ind = @index(Global, Cartesian)
-    (i,j) = ind.I
-
-
-    # k = length(dm.grid.grids[1])
-    # i_ = (n-1)÷k
-    # j_ = (n-1) - (i)*κ
-    # i = i_+1
-    # j = j_+1
-    
-    # TODO: special function here
-    s_ = dm.grid[n]
-    s = Dolo.QP((i,j), s_)
-
-    xx = x[n]
-
-    # (i,j) = @inline Dolo.from_linear(model.grid, n)    
-
-    rr = Dolo.F(dm, s, xx, φ)
-
-    r[n] = rr
-
-end
-
-using BenchmarkTools
-
-fun_ = ggg(t_e)
-
-@time fun_(wk.r0, dm32, wk.x0, wk.φ; ndrange=(p,q))
-
-
-@benchmark fun_(wk.r0, dm32, wk.x0, wk.φ; ndrange=(p,q))
-
-
-@benchmark Dolo.F!(wk0.r0, dm32, wk0.x0, wk0.φ)
-
-
-Dolo.distance(wk0.r0, adapt(Array,wk.r0))
-
 
 # try
 
@@ -91,16 +48,13 @@ Dolo.distance(wk0.r0, adapt(Array,wk.r0))
     # nothing
 # end
 
+wk0 = Dolo.time_iteration_workspace(dm32)
+wk = adapt(CuArray, wk0)
+@time Dolo.time_iteration(dm32, wk, verbose=false; engine=:gpu);
 
-
-Dolo.time_iteration(dm32, wk; engine=:gpu)
-
-
-s_0 = dm32.grid[10]
-s0 = Dolo.QP((2,2), s_0)
-xx0 = wk0.x0[10]
+@time Dolo.time_iteration(dm32, wk0, verbose=false; engine=:gpu);
 
 
-fon(dm32, s0, xx0) = sum(w*S.val for (w,S) in Dolo.τ(dm32, s0, xx0))
+Dolo.distance(adapt(Array,wk.x0) , wk0.x0)
 
-@code_warntype fon(dm32, s0, xx0)
\ No newline at end of file
+# 357 times faster !
\ No newline at end of file
diff --git a/src/adapt.jl b/src/adapt.jl
index 10362ef8..42ca3d1c 100644
--- a/src/adapt.jl
+++ b/src/adapt.jl
@@ -34,3 +34,6 @@ end
 import KernelAbstractions: get_backend
 
 get_backend(g::GArray) = get_backend(g.data)
+
+import CUDA: CuArray
+distance(x::GVector{G, A}, y::GVector{G,A}) where G where A<:CuArray = Base.mapreduce(u->maximum(u), max, x.data-y.data)
\ No newline at end of file
diff --git a/src/algos/time_iteration_accelerated.jl b/src/algos/time_iteration_accelerated.jl
index a4ca41f4..e1d9f125 100644
--- a/src/algos/time_iteration_accelerated.jl
+++ b/src/algos/time_iteration_accelerated.jl
@@ -39,23 +39,30 @@ using KernelAbstractions
 # This is for a PGrid model only 
 function F!(r, model, x, φ, engine)
 
-    @kernel function FF_(r, @Const(model), @Const(x), @Const(φ))
-
+    @kernel function FF_(r, @Const(dm), @Const(x), @Const(φ))
 
         n = @index(Global, Linear)
-        (i,j) = Dolo.from_linear(model.grid, n)
-
-        s_ = model.grid[n]
-        s = QP((i,j), s_)
-        xx = x[n]
-        
-        rr = sum(
-            w*Dolo.arbitrage(model,s,xx,S,φ(S)) 
-            for (w,S) in Dolo.τ(model, s, xx)
-        )      
+        ind = @index(Global, Cartesian)
+        (i,j) = ind.I
+    
+        # k = length(dm.grid.grids[1])
+        # i_ = (n-1)÷k
+        # j_ = (n-1) - (i)*κ
+        # i = i_+1
+        # j = j_+1
         
-        r[i,j] = rr
-
+        # TODO: special function here
+        s_ = dm.grid[n]
+        s = Dolo.QP((i,j), s_)
+    
+        xx = x[n]
+    
+        # (i,j) = @inline Dolo.from_linear(model.grid, n)    
+    
+        rr = Dolo.F(dm, s, xx, φ)
+    
+        r[n] = rr
+    
     end
 
     fun_cpu = FF_(engine)
@@ -79,28 +86,36 @@ end
 
 
 
-function dF_1!(out, model, controls::GArray, φ::Union{GArray, DFun}, ::CPU)
+function dF_1!(out, model, controls::GArray, φ::Union{GArray, DFun}, engine)
 
-    @kernel function FF_(r,@Const(model), @Const(x),@Const(φ) )
+    @kernel function FF_(r,@Const(dm), @Const(x),@Const(φ) )
 
-        # c = @index(Global, Cartesian)
-        # i,j = c.I
 
         n = @index(Global, Linear)
-        # i,j = c.I
-        (i,j) = Dolo.from_linear(model.grid, n)
-
-        s_ = model.grid[n]
-        s = QP((i,j), s_)
-        xx = x[i,j]
+        ind = @index(Global, Cartesian)
+        (i,j) = ind.I
+    
+        # k = length(dm.grid.grids[1])
+        # i_ = (n-1)÷k
+        # j_ = (n-1) - (i)*κ
+        # i = i_+1
+        # j = j_+1
         
+        # TODO: special function here
+        s_ = dm.grid[n]
+        s = Dolo.QP((i,j), s_)
+    
+        xx = x[n]
+    
+        # (i,j) = @inline Dolo.from_linear(model.grid, n)    
+    
         rr = ForwardDiff.jacobian(u->F(model, s, u, φ), xx)
-        
-        r[i,j] = rr
+    
+        r[n] = rr
 
     end
 
-    fun_cpu = FF_(CPU())
+    fun_cpu = FF_(engine)
 
     if typeof(model.grid)<:CGrid
         p = model.grid.ranges[1][3]

From f228dec33f317189ce3e3b082eb9c773a84080f1 Mon Sep 17 00:00:00 2001
From: Pablo Winant <pablo.winant@gmail.com>
Date: Tue, 28 May 2024 16:34:45 +0200
Subject: [PATCH 09/17] GPUfied splines 1/2

---
 Project.toml                            |   3 +-
 examples/ymodels/rbc_mc.jl              |   3 +-
 misc/dev_float32.jl                     |  71 +++++++---
 src/adapt.jl                            |  13 +-
 src/algos/time_iteration.jl             |  36 +++--
 src/algos/time_iteration_accelerated.jl | 179 ++++++++++++++----------
 src/dev_L2.jl                           |  53 +++++--
 src/garray.jl                           |   8 +-
 src/grids.jl                            |   1 +
 src/splines/interp.jl                   |   6 +-
 src/splines/macros.jl                   |  30 +++-
 11 files changed, 260 insertions(+), 143 deletions(-)

diff --git a/Project.toml b/Project.toml
index fbd43119..31e1a6b2 100644
--- a/Project.toml
+++ b/Project.toml
@@ -11,7 +11,6 @@ BlockDiagonals = "0a1fb500-61f7-11e9-3c65-f5ef3456f9f0"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Continuables = "79afa230-ca09-11e8-120b-5decf7bf5e25"
 Crayons = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
-Cthulhu = "f68482b8-f384-11e8-15f7-abe071a5a75f"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
@@ -30,7 +29,6 @@ LabelledArrays = "2ee39098-c373-598a-b85f-a56591580800"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LinearMaps = "7a12625a-238d-50fd-b39a-03d52299707e"
 LinearOperators = "5c8ed15e-5a4c-59e4-a42b-c7e8811fb125"
-LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
 NLsolve = "2774e3e8-f4cf-5e23-947b-6d7e65073b56"
 Optim = "429524aa-4258-5aef-a3af-852621145aeb"
@@ -47,3 +45,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
+
diff --git a/examples/ymodels/rbc_mc.jl b/examples/ymodels/rbc_mc.jl
index a27ae097..c5ec190e 100644
--- a/examples/ymodels/rbc_mc.jl
+++ b/examples/ymodels/rbc_mc.jl
@@ -1,6 +1,6 @@
 
 using StaticArrays
-import Dolo: transition, arbitrage
+import Dolo: transition, arbitrage, intermediate
 
 model = let 
 
@@ -87,6 +87,7 @@ function arbitrage(model::typeof(model), s::NamedTuple, x::NamedTuple, S::NamedT
 
 	y = intermediate(model, s, x)
 	Y = intermediate(model, S, X)
+
 	res_1 = p.χ*(x.n^p.η)*(y.c^p.σ) - y.w
 	res_2 = (p.β*(y.c/Y.c)^p.σ)*(1 - p.δ + Y.rk) - 1
     
diff --git a/misc/dev_float32.jl b/misc/dev_float32.jl
index 24be933e..801bdb2f 100644
--- a/misc/dev_float32.jl
+++ b/misc/dev_float32.jl
@@ -6,41 +6,57 @@ model_32 = include("$(root_dir)/misc/rbc_float32.jl")
 
 model32 = Dolo.convert_precision(Float32, model_32)
 
-dm32 = Dolo.discretize(model32, Dict(:endo=>[100000]) )
-
+dm32 = Dolo.discretize(model32, Dict(:endo=>[1000]) )
 
 typeof(dm32)
 
-using Adapt
 
+using Adapt
 import oneAPI: oneArray
 import Adapt: adapt_structure
-import CUDA: CuArray
+# import CUDA: CuArray
 
 
-wk0 = Dolo.time_iteration_workspace(dm32)
+wk0 = Dolo.time_iteration_workspace(dm32);
 
-wk = Dolo.time_iteration_workspace(dm32, dest=CuArray)
+# wk = Dolo.time_iteration_workspace(dm32, dest=CuArray)
+wk = Dolo.time_iteration_workspace(dm32, dest=oneArray);
+
+
+wk = adapt(oneArray, wk0)
+# wk = adapt(CuArray, wk0)
 
-# wk = adapt(oneArray, wk0)
 
-wk = adapt(CuArray, wk0)
 using KernelAbstractions: get_backend
 
-if typeof(dm32.grid)<:Dolo.CGrid
-    p = dm32.grid.ranges[1][3]
-    q = dm32.grid.ranges[2][3]
-else
-    p = length(dm32.grid.grids[1])
-    q = length(dm32.grid.grids[2])
-end
 
 t_e = get_backend(wk.x0)
 
-using KernelAbstractions
+using KernelAbstractions: CPU
 using StaticArrays
 
 
+L0 = Dolo.dF_2(dm32, wk0.x0*0, wk0.φ)
+L0_gpu = adapt(oneArray, L0)
+
+L = deepcopy(L0)
+
+@time Dolo.dF_2!(L, dm32, wk0.x0, wk0.φ, CPU())
+
+# @time Dolo.dd_2!(L, dm32, wk0.x0, wk0.φ, CPU())
+
+
+@time Dolo.dF_2!(L0_gpu, dm32, wk.x0, wk.φ, t_e)
+
+Lg = adapt(Array, L0_gpu)
+
+
+@time L*wk0.x0;
+
+#It works !
+using BenchmarkTools
+@benchmark Dolo.mul!(wk0.r0, L, wk0.x0, CPU())
+@benchmark Dolo.mul!(wk.r0, L0_gpu, wk.x0, t_e)
 # try
 
     # res = fun_(wk.r0, dm32, wk.x0, wk.φ; ndrange=(p,q))
@@ -48,13 +64,24 @@ using StaticArrays
     # nothing
 # end
 
-wk0 = Dolo.time_iteration_workspace(dm32)
-wk = adapt(CuArray, wk0)
-@time Dolo.time_iteration(dm32, wk, verbose=false; engine=:gpu);
+wk0 = Dolo.time_iteration_workspace(dm32; interp_mode=:cubic)
+# wk = adapt(CuArray, wk0)
+wk = adapt(oneArray, wk0)
+
+@time Dolo.time_iteration(dm32, wk, verbose=true, improve=true; engine=:gpu, improve_wait=10, improve_K=500, T=20);
+
+
+
+
+@time Dolo.time_iteration(dm32, wk0, verbose=true, improve=false;  T=20);
+
+
+
 
-@time Dolo.time_iteration(dm32, wk0, verbose=false; engine=:gpu);
+using ForwardDiff
 
+ForwardDiff.derivative(u->unsafe_trunc(u), 0.1)
 
-Dolo.distance(adapt(Array,wk.x0) , wk0.x0)
+ForwardDiff.jacobian(u->trunc.(Int, u), SVector(0.3, 0.2))
 
-# 357 times faster !
\ No newline at end of file
+ForwardDiff.jacobian(u->unsafe_trunc.(Int, u), SVector(0.3, 0.2))
diff --git a/src/adapt.jl b/src/adapt.jl
index 42ca3d1c..90702d08 100644
--- a/src/adapt.jl
+++ b/src/adapt.jl
@@ -31,9 +31,14 @@ function adapt_structure(to, f::DFun{Dom, Gar, Itp, vars}) where Dom where Gar w
     )
 end
 
-import KernelAbstractions: get_backend
+function adapt_structure(to, L::LL{G,D,F}) where G where D where F
+    LL(L.grid, adapt(to, L.D), adapt(to, L.φ))
+end
+
 
-get_backend(g::GArray) = get_backend(g.data)
+# should it be merged with the general definition?
+# import CUDA: CuArray
+# distance(x::GVector{G, A}, y::GVector{G,A}) where G where A<:CuArray = Base.mapreduce(u->maximum(u), max, x.data-y.data)
 
-import CUDA: CuArray
-distance(x::GVector{G, A}, y::GVector{G,A}) where G where A<:CuArray = Base.mapreduce(u->maximum(u), max, x.data-y.data)
\ No newline at end of file
+import oneAPI: oneArray
+distance(x::GVector{G, A}, y::GVector{G,A}) where G where A<:oneArray = Base.mapreduce(u->maximum(u), max, x.data-y.data)
\ No newline at end of file
diff --git a/src/algos/time_iteration.jl b/src/algos/time_iteration.jl
index 95a6334c..efce75fe 100644
--- a/src/algos/time_iteration.jl
+++ b/src/algos/time_iteration.jl
@@ -33,7 +33,7 @@ function F(dmodel::M, s::QP, x::SVector{d,T}, φ::Union{Policy, GArray, DFun}) w
     # )
     r
     # r::SVector{d,T}
-    r = complementarities(dmodel.model, s,x,r)
+    # r = complementarities(dmodel.model, s,x,r)
     r
 end
 
@@ -128,7 +128,7 @@ end
 using LinearMaps
 using Adapt
 
-function time_iteration_workspace(dmodel; interp_mode=:linear, dest=Array)
+function time_iteration_workspace(dmodel; interp_mode=:linear, improve=false, dest=Array)
 
     T = eltype(dmodel)
 
@@ -146,13 +146,14 @@ function time_iteration_workspace(dmodel; interp_mode=:linear, dest=Array)
     vars = variables(dmodel.model.controls)
     φ = DFun(dmodel.model.states, x0, vars; interp_mode=interp_mode)
 
-    tt = (;x0, x1, x2, r0, dx, J, φ)
+    # if improve
+        L = Dolo.dF_2(dmodel, x1, φ)
+        tt = (;x0, x1, x2, r0, dx, J, L, φ)
+    # else
+        # tt = (;x0, x1, x2, r0, dx, J, φ)
+    # end
 
-    res = NamedTuple(
-        ( (k=>adapt(dest, v)) for (k,v) in zip(keys(tt),values(tt)) )
-    )
-    return res
-    # return adapt(dest, tt)
+    return adapt(dest, tt)
 
 end
 
@@ -174,6 +175,7 @@ function time_iteration(model::YModel; kwargs...)
     time_iteration(dmodel, wksp; kwargs2...)
 end
 
+using oneAPI: oneArray
 function time_iteration(model::DYModel,
     workspace=time_iteration_workspace(model);
     T=500,
@@ -199,14 +201,18 @@ function time_iteration(model::DYModel,
 
     # mem = typeof(workspace) <: Nothing ? time_iteration_workspace(model) : workspace
     mbsteps = 5
-    lam = 0.5
+
+    (;x0, x1, x2, dx, r0, J, φ) = workspace
+
     
     local η_0 = NaN
     local η
     convergence = false
     iterations = T
     
-    (;x0, x1, x2, dx, r0, J, φ) = workspace
+    Tf = eltype(eltype(x0))
+
+    lam = convert(Tf, 0.5)
 
     if engine in (:cpu, :gpu)
         t_engine = get_backend(workspace.x0)
@@ -217,7 +223,7 @@ function time_iteration(model::DYModel,
     ti_trace = trace ? IterationTrace(typeof(φ)[]) : nothing
 
     if improve
-        J_2 = Dolo.dF_2(model, x1, φ)
+        J_2 = workspace.L
     end
 
     for t=1:T
@@ -306,12 +312,16 @@ function time_iteration(model::DYModel,
             Dolo.dF_1!(J_1, model, x1, φ, t_engine)
             Dolo.dF_2!(J_2, model, x1, φ, t_engine)
 
-            mul!(J_2, -1.0)
+            mul!(J_2, convert(Tf,-1.0))
+
             # J_2.M_ij[:] *= -1.0
             Tp = J_1 \ J_2
             
             d = (x1-x0)
-            x0.data .+= neumann(Tp, d; K=improve_K)
+
+            rhs =  neumann(Tp, d; K=improve_K)
+
+            x0.data .+= rhs.data
 
         else
             x0.data .= x1.data
diff --git a/src/algos/time_iteration_accelerated.jl b/src/algos/time_iteration_accelerated.jl
index e1d9f125..9f565672 100644
--- a/src/algos/time_iteration_accelerated.jl
+++ b/src/algos/time_iteration_accelerated.jl
@@ -1,40 +1,13 @@
 # This starts making a difference when grids have more than 200000 points...
 
 using KernelAbstractions
+import KernelAbstractions.Extras: @unroll
 
+import KernelAbstractions: get_backend
 
+get_backend(g::GArray) = get_backend(g.data)
 
-# # This is for a CGrid model only 
-# function F!(r, model, x, φ, ::CPU)
-
-#     @kernel function FF_(r, @Const(model), @Const(x), @Const(φ))
-
-#         i = @index(Global, Linear)
-
-#         s_ = model.grid[i]
-#         s = QP((i,), s_)
-#         xx = x[i]
-        
-#         rr = sum(
-#             w*Dolo.arbitrage(model,s,xx,S,φ(S)) 
-#             for (w,S) in Dolo.τ(model, s, xx)
-#         )      
-        
-#         r[i] = rr
-
-#     end
-
-#     fun_cpu = FF_(CPU())
-
-
-#     p = length(model.grid)
-    
-#     # p,q = size(x)
-
-#     res = fun_cpu(r, model, x, φ; ndrange=(p,))
-#     wait(res)
 
-# end
 
 # This is for a PGrid model only 
 function F!(r, model, x, φ, engine)
@@ -42,6 +15,7 @@ function F!(r, model, x, φ, engine)
     @kernel function FF_(r, @Const(dm), @Const(x), @Const(φ))
 
         n = @index(Global, Linear)
+
         ind = @index(Global, Cartesian)
         (i,j) = ind.I
     
@@ -68,19 +42,10 @@ function F!(r, model, x, φ, engine)
     fun_cpu = FF_(engine)
 
 
-    # p = length(model.grid.grids[1])
-    # q = length(model.grid.grids[2])
-    # p,q = size(x)
-    if typeof(model.grid)<:CGrid
-        p = model.grid.ranges[1][3]
-        q = model.grid.ranges[2][3]
-    else
-        p = length(model.grid.grids[1])
-        q = length(model.grid.grids[2])
-    end
-    
-    res = fun_cpu(r, model, x, φ; ndrange=(p,q))
-    # wait(res)
+    sz = size(model.grid)
+
+    res = fun_cpu(r, model, x, φ; ndrange=sz)
+
 
 end
 
@@ -117,62 +82,91 @@ function dF_1!(out, model, controls::GArray, φ::Union{GArray, DFun}, engine)
 
     fun_cpu = FF_(engine)
 
-    if typeof(model.grid)<:CGrid
-        p = model.grid.ranges[1][3]
-        q = model.grid.ranges[2][3]
-    else
-        p = length(model.grid.grids[1])
-        q = length(model.grid.grids[2])
-    end
-    # p,q = size(out)
+    sz = size(model.grid)
 
-    res = fun_cpu(out, model, controls, φ; ndrange=(p,q))
+    res = fun_cpu(out, model, controls, φ; ndrange=sz)
     # wait(res)
 
 end    #### no alloc
     
+function dF_2!(out, dmodel, controls::GArray, φ::DFun, engine)
 
-function dF_2!(L, dmodel, xx::GArray, φ::DFun, ::CPU)
+    @kernel function FF_(L,@Const(dm), @Const(x),@Const(φ) )
 
-    # for (n,(s,x)) in enumerate(zip(Dolo.enum(dmodel.grid), xx))
-    Threads.@threads for n=1:length(dmodel.grid)
 
-        i,j = Dolo.from_linear(dmodel.grid, n)
-            
-        s_ = dmodel.grid[i,j]
-        s = QP((i,j), s_)
-        x = xx[n]
-
-        L.D[n] = tuple(
-                (
-                    (;
-                        F_x=w*ForwardDiff.jacobian(u->Dolo.arbitrage(dmodel,s,x,S,u), φ(S)),
-                        S=S
-                    )
-                for (w,S) in Dolo.τ(dmodel, s, x)
+        n = @index(Global, Linear)
+        ind = @index(Global, Cartesian)
+        (i,j) = ind.I
+    
+        # TODO: special function here
+        s_ = dm.grid[n]
+        s = Dolo.QP((i,j), s_)
+    
+        xx = x[n]
+    
+        tt = tuple(
+            (
+                (;
+                    F_x=w*ForwardDiff.jacobian(u->Dolo.arbitrage(dmodel,s,xx,S,u), φ(S)),
+                    S=S
                 )
+            for (w,S) in Dolo.τ(dmodel, s, xx)
+            )
         ...)
 
+        L.D[n] = tt
+
     end
 
+    fun_cpu = FF_(engine)
+    sz = size(dmodel.grid)
+
+    res = fun_cpu(out, dmodel, controls, φ; ndrange=sz)
+
     nothing
 
 
+
 end
 
-using KernelAbstractions
-import KernelAbstractions.Extras: @unroll
 
-function mul!(dr, L2::Dolo.LL, x, ::CPU)
+# function dF_2!(L, dmodel, xx::GArray, φ::DFun, ::CPU)
+
+#     # for (n,(s,x)) in enumerate(zip(Dolo.enum(dmodel.grid), xx))
+#     Threads.@threads for n=1:length(dmodel.grid)
+
+#         i,j = Dolo.from_linear(dmodel.grid, n)
+            
+#         s_ = dmodel.grid[i,j]
+#         s = QP((i,j), s_)
+#         x = xx[n]
+
+#         L.D[n] = tuple(
+#                 (
+#                     (;
+#                         F_x=w*ForwardDiff.jacobian(u->Dolo.arbitrage(dmodel,s,x,S,u), φ(S)),
+#                         S=S
+#                     )
+#                 for (w,S) in Dolo.τ(dmodel, s, x)
+#                 )
+#         ...)
+
+#     end
+
+#     nothing
+
+
+# end
+
+function mul!(dr, L2::Dolo.LL, x, engine)
  
     D = L2.D
     dφ = L2.φ
 
     fit!(dφ, x)
 
-
-    @kernel function tm_(dr, @Const(L2), @Const(x))
-        n = @index(Global)
+    @kernel function tm_(dr, @Const(D), @Const(dφ))
+        n = @index(Global, Linear)
         t0 = zero(eltype(dr))
         @unroll for k=1:length(D[n])    # @unroll provides  a 20% gain
             (;F_x, S) = D[n][k]      # profile: slow ?
@@ -181,11 +175,42 @@ function mul!(dr, L2::Dolo.LL, x, ::CPU)
         dr[n] = t0
     end
 
-    fun = tm_(CPU())
+    fun = tm_(engine)
 
     K = length(D)
-    res = fun(dr, L2, x; ndrange=K)
+    res = fun(dr, D, dφ; ndrange=K)
+
+    synchronize(engine)
     # wait(res)
-    res
+    nothing
+
+end
+
+
+function \(J, L::Dolo.LL) 
+
+
+    @kernel function tm_(@Const(J), L)
+        n = @index(Global)
+        
+        elt = L.D[n]
+        M = J.data[n]
+        tt = tuple((
+            (;F_x=M\d.F_x, S=d.S) 
+            for d in elt
+        )...)
+        L.D[n] = tt
+    
+    end
+
+    engine = get_backend(J.data)
+    fun = tm_(engine)
+
+    K = length(J.data)
+
+    nL = deepcopy(L)
+    res = fun(J, nL; ndrange=K)
+
+    return nL
 
 end
diff --git a/src/dev_L2.jl b/src/dev_L2.jl
index 39c9b997..9ab5f68d 100644
--- a/src/dev_L2.jl
+++ b/src/dev_L2.jl
@@ -12,18 +12,37 @@ const LF = LL
 import Base: /,*
 import LinearAlgebra: mul!
 
+# function mul!(L::LL, x::Number)
+
+#     for n in 1:length(L.D)
+#         L.D[n]  =tuple((
+#             (;F_x=x*d.F_x, S=d.S) 
+#             for d in L.D[n]
+#         )...)
+#     end
+# end
+
 function mul!(L::LL, x::Number)
 
-    for n in 1:length(L.D)
-        L.D[n]  =tuple((
+    map!(
+        e->tuple((
             (;F_x=x*d.F_x, S=d.S) 
-            for d in L.D[n]
-        )...)
-    end
+            for d in e
+        )...),
+        L.D,
+        L.D
+    )
+    # for n in 1:length(L.D)
+    #     L.D[n]  =tuple((
+    #         (;F_x=x*d.F_x, S=d.S) 
+    #         for d in L.D[n]
+    #     )...)
+    # end
 end
 
 function mul!(dr, L2::LF, x)
-    mul!(dr, L2::LF, x, CPU())
+    engine=get_backend(L2.D)
+    mul!(dr, L2::LF, x, engine)
 end
 
 
@@ -45,10 +64,13 @@ end
 
 
 function *(L::LL, x0)
-    r = deepcopy(x0)
-    r.data .*= 0.0
+
+    # BUG: with deepcopy, doesn' t work
+    r = duplicate(x0) ###### BUG: this doesn't wokr on the GPU
+    Tf = getprecision(x0)
+    r.data .*= convert(Tf, 0.0)
     mul!(r, L, x0)
-    return r
+
 end
 
 # this takes 0.2 s !
@@ -127,9 +149,14 @@ end
 
 function neumann(L2::LF, r0; K=1000, τ_η=1e-10)
     
-    dx = deepcopy(r0)
-    du = deepcopy(r0)
-    dv = deepcopy(r0)
+    # TODO
+    # dx = deepcopy(r0)
+    # du = deepcopy(r0)
+    # dv = deepcopy(r0)
+
+    dx = duplicate(r0)
+    du = duplicate(r0)
+    dv = duplicate(r0)
 
     mem = (;du, dv)
 
@@ -139,7 +166,7 @@ function neumann(L2::LF, r0; K=1000, τ_η=1e-10)
 
 end
 
-function neumann!(dx, L2::LF, r0, mem=(;du=deepcopy(r0), dv=deepcopy(r0)); K=1000, τ_η=1e-10)
+function neumann!(dx, L2::LF, r0, mem=(;du=duplicate(r0), dv=duplicate(r0)); K=1000, τ_η=1e-10)
     
     (; du, dv) = mem
     
diff --git a/src/garray.jl b/src/garray.jl
index 07648185..66074419 100644
--- a/src/garray.jl
+++ b/src/garray.jl
@@ -39,7 +39,7 @@ function setindex!(a::GArray{PGrid{G1, G2, d}, T}, v, i::Int, j::Int) where G1 w
     nothing
 end
 
-
+getprecision(g::GArray{G,T}) where G where T = eltype(g).types[1].types[1]
 
 
 eltype(g::GArray{G,T}) where G where T = eltype(T)
@@ -142,3 +142,9 @@ function Base.convert(::Type{Matrix}, A::GArray{G,Vector{T}}) where G where T <:
     end
     return M
 end
+
+
+
+### TODO
+
+duplicate(g::GArray{G,T}) where G where T = GArray(g.grid, deepcopy(g.data))
\ No newline at end of file
diff --git a/src/grids.jl b/src/grids.jl
index c4062159..0200e742 100644
--- a/src/grids.jl
+++ b/src/grids.jl
@@ -111,6 +111,7 @@ end
 @inline to__linear_index(g::PGrid, ind::Tuple{Int64, Int64}) =  ind[1] + length(g.grids[1])*(ind[2]-1)
 
 
+size(g::PGrid) = tuple((length(g) for g in g.grids)...)
 show(io::IO, g::SGrid{d1, d2}) where d1 where d2 = print(io, "SGrid{$(d1)}")
 
 show(io::IO, g::CGrid{d}) where d = let 
diff --git a/src/splines/interp.jl b/src/splines/interp.jl
index c68a8f1d..725b38ce 100644
--- a/src/splines/interp.jl
+++ b/src/splines/interp.jl
@@ -1,8 +1,8 @@
 using StaticArrays
-import LoopVectorization: VectorizationBase
 import Base: getindex
 
-getindex(A::Vector{Tf}, i::VectorizationBase.Vec{4,Int64}) where Tf = VectorizationBase.Vec{4, Tf}(A[i(1)], A[i(2)], A[i(3)], A[i(4)])
+# import LoopVectorization: VectorizationBase
+# getindex(A::Vector{Tf}, i::VectorizationBase.Vec{4,Int64}) where Tf = VectorizationBase.Vec{4, Tf}(A[i(1)], A[i(2)], A[i(3)], A[i(4)])
 
 
 # ## TODO : rewrite the following
@@ -13,7 +13,7 @@ getindex(A::Vector{Tf}, i::VectorizationBase.Vec{4,Int64}) where Tf = Vectorizat
     return sum( xx .* v[:])
 end
 
-@inline function reduce_tensors(λ::SVector{2,U}, M::SArray{T,V,2,W}) where d1 where d2 where T where U where V where W
+@inline function reduce_tensors(λ::SVector{2,U}, M::SArray{T,V,2,W})  where T where U where V where W
     v1 = SVector(1-λ[1],λ[1])
     v2 = SVector(1-λ[2],λ[2])
     return M[1,1]*v1[1]*v2[1] + M[1,2]*v1[1]*v2[2] + M[2,1]*v1[2]*v2[1] + M[2,2]*v1[2]*v2[2]
diff --git a/src/splines/macros.jl b/src/splines/macros.jl
index 2629ae3f..5e354d23 100644
--- a/src/splines/macros.jl
+++ b/src/splines/macros.jl
@@ -160,11 +160,12 @@ function create_local_parameters(d; vectorize=true, Tf=Float64)
     lines = []
     if vectorize
         for i=1:d
+
             bl = quote
                 $(U("x",i)) = S[n][$i]
                 $(U("u",i)) = ($(U("x",i)) - $(U("start",i)))*$(U("dinv",i))
                 # $(U("i",i)) = (floor(Int,$(U("u",i)) ))
-                $(U("i",i)) = unsafe_trunc(Int32, $(U("u",i)))
+                $(U("i",i)) = unsafe_trunc(Int, $(U("u",i)))
                 $(U("i",i)) = max( min($(U("i",i)),$(U("M",i))-2), 0 )
                 $(U("t",i)) = $(U("u",i))-$(U("i",i))
                 $(U("tp",i,1)) = $(U("t",i))*$(U("t",i))*$(U("t",i))
@@ -178,14 +179,29 @@ function create_local_parameters(d; vectorize=true, Tf=Float64)
         end
     else
         lines = []
+        e = quote
+                x_ = SVector(S...)
+                a_ = SVector( a... )
+                b_ = SVector( b... )
+                n_ = SVector( orders... )
+                δ_ = (b_.-a_)./(n_.-1)
+                i_ = div.( (x_.-a_), δ_)
+                i_ = max.(min.(i_, n_.-2), 0)
+                λ_ = (x_.-(a_ .+ δ_.*i_))./δ_
+                ii_ = unsafe_trunc.(Int,i_)
+
+            end
+        for x in e.args
+            push!(lines, x)
+        end
         for i=1:d
             bl = quote
-                $(U("x",i)) = S[$i]
-                $(U("u",i)) = ($(U("x",i)) - $(U("start",i)))*$(U("dinv",i))
-                # $(U("i",i)) = (floor(Int,$(U("u",i)) ))
-                $(U("i",i)) = unsafe_trunc(Int32, $(U("u",i)) )
-                $(U("i",i)) = max( min($(U("i",i)),$(U("M",i))-2), 0 )
-                $(U("t",i)) = $(U("u",i))-$(U("i",i))
+
+
+                # compat
+                $(U("i",i)) = ii_[$i]
+                $(U("t",i)) = λ_[$i]
+
                 $(U("tp",i,1)) = $(U("t",i))*$(U("t",i))*$(U("t",i))
                 $(U("tp",i,2)) = $(U("t",i))*$(U("t",i))
                 $(U("tp",i,3)) = $(U("t",i))

From 2f6d73ec67f8ce6ca550b694a7bf05403c060fbe Mon Sep 17 00:00:00 2001
From: Pablo Winant <pablo.winant@gmail.com>
Date: Sun, 2 Jun 2024 00:01:43 +0200
Subject: [PATCH 10/17] WIP

---
 Project.toml                   |   2 +-
 misc/dev_float32.jl            |  86 ++++++++++++--------
 src/splines/cubic_prefilter.jl | 144 ++++++++++++++++++++++++---------
 src/splines/splines.jl         |   2 +-
 4 files changed, 160 insertions(+), 74 deletions(-)

diff --git a/Project.toml b/Project.toml
index 31e1a6b2..462d832c 100644
--- a/Project.toml
+++ b/Project.toml
@@ -20,6 +20,7 @@ FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41"
 FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"
 Formatting = "59287772-0a20-5a39-b81b-1366585eb4c0"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
 IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
@@ -45,4 +46,3 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
-
diff --git a/misc/dev_float32.jl b/misc/dev_float32.jl
index 801bdb2f..719405f2 100644
--- a/misc/dev_float32.jl
+++ b/misc/dev_float32.jl
@@ -16,72 +16,92 @@ import oneAPI: oneArray
 import Adapt: adapt_structure
 # import CUDA: CuArray
 
+import Dolo
 
 wk0 = Dolo.time_iteration_workspace(dm32);
 
 # wk = Dolo.time_iteration_workspace(dm32, dest=CuArray)
-wk = Dolo.time_iteration_workspace(dm32, dest=oneArray);
+@time wk = Dolo.time_iteration_workspace(dm32, dest=oneArray; interp_mode=:cubic);
 
 
-wk = adapt(oneArray, wk0)
-# wk = adapt(CuArray, wk0)
+time_iteration(dm32, wk0; engine=:gpu, tol_η=1e-5, improve_wait=3, improve=true)
+time_iteration(dm32, wk; engine=:gpu, tol_η=1e-5, improve_wait=3, improve=true)
 
 
-using KernelAbstractions: get_backend
 
 
-t_e = get_backend(wk.x0)
 
-using KernelAbstractions: CPU
-using StaticArrays
+# list available options
 
+# @time wk = Dolo.time_iteration_workspace(dm32, dest=Array; interp_mode=:cubic);
 
-L0 = Dolo.dF_2(dm32, wk0.x0*0, wk0.φ)
-L0_gpu = adapt(oneArray, L0)
+# @time Dolo.splines.prefilter!(c0, Val(:CPU))
+# @time Dolo.splines.prefilter!(c0, Val(:Threads))
+# @time Dolo.splines.prefilter!(c0, Val(:KA))
 
-L = deepcopy(L0)
+# @time Dolo.splines.prefilter!(c0)
+# @time Dolo.splines.prefilter!(c1,Val(:KA))
 
-@time Dolo.dF_2!(L, dm32, wk0.x0, wk0.φ, CPU())
+# @time Dolo.splines.prefilter!(c,Val(:KA))
 
-# @time Dolo.dd_2!(L, dm32, wk0.x0, wk0.φ, CPU())
+# c_gpu = adapt(Array, c)
 
+# # Dolo.splines.prefilter!(c)
 
-@time Dolo.dF_2!(L0_gpu, dm32, wk.x0, wk.φ, t_e)
 
-Lg = adapt(Array, L0_gpu)
 
+# t_e = get_backend(wk.x0)
 
-@time L*wk0.x0;
+# using KernelAbstractions: CPU
+# using StaticArrays
 
-#It works !
-using BenchmarkTools
-@benchmark Dolo.mul!(wk0.r0, L, wk0.x0, CPU())
-@benchmark Dolo.mul!(wk.r0, L0_gpu, wk.x0, t_e)
-# try
 
-    # res = fun_(wk.r0, dm32, wk.x0, wk.φ; ndrange=(p,q))
-# catch err
-    # nothing
-# end
+# L0 = Dolo.dF_2(dm32, wk0.x0*0, wk0.φ)
+# L0_gpu = adapt(oneArray, L0)
 
-wk0 = Dolo.time_iteration_workspace(dm32; interp_mode=:cubic)
-# wk = adapt(CuArray, wk0)
-wk = adapt(oneArray, wk0)
+# L = deepcopy(L0)
 
-@time Dolo.time_iteration(dm32, wk, verbose=true, improve=true; engine=:gpu, improve_wait=10, improve_K=500, T=20);
+# @time Dolo.dF_2!(L, dm32, wk0.x0, wk0.φ, CPU())
 
+# # @time Dolo.dd_2!(L, dm32, wk0.x0, wk0.φ, CPU())
 
 
+# @time Dolo.dF_2!(L0_gpu, dm32, wk.x0, wk.φ, t_e)
 
-@time Dolo.time_iteration(dm32, wk0, verbose=true, improve=false;  T=20);
+# Lg = adapt(Array, L0_gpu)
 
 
+# @time L*wk0.x0;
 
+# #It works !
+# using BenchmarkTools
+# @benchmark Dolo.mul!(wk0.r0, L, wk0.x0, CPU())
+# @benchmark Dolo.mul!(wk.r0, L0_gpu, wk.x0, t_e)
+# # try
 
-using ForwardDiff
+#     # res = fun_(wk.r0, dm32, wk.x0, wk.φ; ndrange=(p,q))
+# # catch err
+#     # nothing
+# # end
 
-ForwardDiff.derivative(u->unsafe_trunc(u), 0.1)
+# wk0 = Dolo.time_iteration_workspace(dm32; interp_mode=:cubic)
+# # wk = adapt(CuArray, wk0)
+# wk = adapt(oneArray, wk0)
 
-ForwardDiff.jacobian(u->trunc.(Int, u), SVector(0.3, 0.2))
+# @time Dolo.time_iteration(dm32, wk, verbose=true, improve=true; engine=:gpu, improve_wait=10, improve_K=500, T=20);
 
-ForwardDiff.jacobian(u->unsafe_trunc.(Int, u), SVector(0.3, 0.2))
+
+
+
+# @time Dolo.time_iteration(dm32, wk0, verbose=true, improve=false;  T=20);
+
+
+
+
+# using ForwardDiff
+
+# ForwardDiff.derivative(u->unsafe_trunc(u), 0.1)
+
+# ForwardDiff.jacobian(u->trunc.(Int, u), SVector(0.3, 0.2))
+
+# ForwardDiff.jacobian(u->unsafe_trunc.(Int, u), SVector(0.3, 0.2))
diff --git a/src/splines/cubic_prefilter.jl b/src/splines/cubic_prefilter.jl
index 34c20319..e6ff2f74 100644
--- a/src/splines/cubic_prefilter.jl
+++ b/src/splines/cubic_prefilter.jl
@@ -87,12 +87,19 @@ end
 
 # end
 
-const zq = sqrt(3.0)-2.0
-const λ = 6.0
+const zq_0 = sqrt(3.0)-2.0
+const λ_0 = 6.0
 
+getprecision(g::AbstractArray{T}) where T<:Number = T
+getprecision(g::AbstractArray{T}) where T<:SVector = eltype(g).types[1].types[1]
 
 
 function prefilter!(c::AbstractVector{T}) where T
+
+
+    prec = getprecision(c)
+    λ = convert(prec, λ_0)
+    zq = convert(prec, zq_0)
     
     N = length(c)
     Horizon = (min(20, N-2))
@@ -101,7 +108,7 @@ function prefilter!(c::AbstractVector{T}) where T
     ff = c[N-1]
 
     t1 = zero(eltype(c)) #0.0
-    zn = 1.0
+    zn = convert(prec,1.0)
     for n in 0:Horizon
         t1 += λ*(2*f1-c[2+n])*zn
         zn *= zq 
@@ -126,59 +133,41 @@ function prefilter!(c::AbstractVector{T}) where T
 end
 
 
+function prefilter!(data::AbstractArray{T,2}, ::Val{:CPU}) where T
 
-# function prefilter!(data::AbstractVector{T}) where T
-
-#     # TODO: find a way to avoid allocation
-
-#     # data is 1d array of length M
-#     # data is 1d array of length M+2
-#     M = length(data) - 2
-#     bands = zeros(M+2, 3)
-    
-#     bb = zeros(T, M+2)
-
-#     # fill_bands!(M, bands, bb, data)
-#     prefilter!(data, bands, bb)
-
-# end
-
-# function prefilter!(data, bands, bb)
-
-#     M = length(data) - 2
-#     fill_bands!(M, bands, bb, data)
-#     solve_coefficients!(bands, bb, data)
-
-# end
-
+    I,J = size(data)
 
+    M = J-2
+    for i=1:I
+        dat = view(data, i, :)
+        prefilter!(dat)
+    end
 
-# function prefilter!(data::Array{T,1}) where T
-#     M = size(data,1)-2
-#     bands = zeros(M+2,3)
-#     bb = zeros(T,M+2)
-#     fill_bands!(M, bands, bb, data)
-#     prefilter!(data, bands, bb)
-# end
+    M = I-2
+    for j=1:J
+        dat = view(data, :, j)
+        prefilter!(dat)
+    end
+end
 
-function prefilter!(data::AbstractArray{T,2}) where T
+function prefilter!(data::AbstractArray{T,2}, ::Val{:Threads}) where T
 
     I,J = size(data)
 
     M = J-2
-    for i=1:I
+    Threads.@threads for i=1:I
         dat = view(data, i, :)
         prefilter!(dat)
     end
 
     M = I-2
-    for j=1:J
+    Threads.@threads for j=1:J
         dat = view(data, :, j)
         prefilter!(dat)
     end
 end
 
-function prefilter!(data::Array{T,3}) where T
+function prefilter!(data::AbstractArray{T,3}) where T
 
     I,J,K = size(data)
 
@@ -209,7 +198,7 @@ function prefilter!(data::Array{T,3}) where T
 end
 
 
-function prefilter!(data::Array{T,4}) where T
+function prefilter!(data::AbstractArray{T,4}) where T
     I,J,K,L = size(data)
 
     M = L-2
@@ -250,3 +239,80 @@ function prefilter!(data::Array{T,4}) where T
         end
     end
 end
+
+#### 
+
+function prefilter!(data::AbstractArray{T,1}, ::Val{:CPU}) where T
+
+    prefilter!(data)
+
+end
+
+using KernelAbstractions: @kernel
+
+
+function prefilter!(data::AbstractArray{T,1}, ::Val{:KA}) where T
+
+    @kernel function ker(m)
+
+        prefilter!(m)
+
+    end
+
+    backend = get_backend(data)
+
+    fun = ker(backend)
+
+    fun(data, ndrange=(1,))
+
+end
+
+using KernelAbstractions
+
+
+using KernelAbstractions: CPU
+
+
+function prefilter!(data::AbstractArray{T,2}, ::Val{:KA}) where T
+
+
+    @kernel function ker_1(m)
+
+        i = @index(Global, Linear)
+
+        dat = view(m, i, :)
+        prefilter!(dat)
+
+    end
+
+    @kernel function ker_2(m)
+
+        j = @index(Global, Linear)
+        
+        dat = view(m, :, j)
+        prefilter!(dat)
+
+    end
+
+
+    backend = get_backend(data)
+
+    if backend==CPU()
+        # Not clear how to set number of threads here
+        fun_1 = ker_1(backend, 8)
+        fun_2 = ker_2(backend, 8)
+    else
+        fun_1 = ker_1(backend)
+        fun_2 = ker_2(backend)
+    end
+    
+    I,J = size(data)
+
+    fun_1(data, ndrange=(I,))
+    fun_2(data, ndrange=(J,))
+
+end
+
+using oneAPI: oneArray
+
+prefilter!(data::oneArray) = prefilter!(data, Val(:KA))
\ No newline at end of file
diff --git a/src/splines/splines.jl b/src/splines/splines.jl
index deb4fec1..45620f29 100644
--- a/src/splines/splines.jl
+++ b/src/splines/splines.jl
@@ -95,7 +95,7 @@ end
             fill!(spl.θ, zero(eltype(spl.θ)))
             ind = tuple( (2:(e[3]+1) for e in spl.grid )...)
             spl.θ[ind...] .= rhs
-            splines.prefilter!(spl.θ)
+            prefilter!(spl.θ, Val(:KA))
         elseif k==1
             spl.θ .= rhs
         end

From 4a6d0ba1788e35e0f22192fbf2511d6e96e6bb18 Mon Sep 17 00:00:00 2001
From: Pablo Winant <pablo.winant@gmail.com>
Date: Sun, 2 Jun 2024 01:09:18 +0200
Subject: [PATCH 11/17] WIP

---
 misc/dev_float32.jl | 30 ++++++++++++++++++++++--------
 src/adapt.jl        |  8 ++++----
 2 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/misc/dev_float32.jl b/misc/dev_float32.jl
index 719405f2..28ad67dc 100644
--- a/misc/dev_float32.jl
+++ b/misc/dev_float32.jl
@@ -6,26 +6,40 @@ model_32 = include("$(root_dir)/misc/rbc_float32.jl")
 
 model32 = Dolo.convert_precision(Float32, model_32)
 
-dm32 = Dolo.discretize(model32, Dict(:endo=>[1000]) )
+dm32 = Dolo.discretize(model32, Dict(:endo=>[100000]) )
 
 typeof(dm32)
 
 
 using Adapt
-import oneAPI: oneArray
+# import oneAPI: oneArray
+import CUDA: CuArray
+# import Cu
 import Adapt: adapt_structure
-# import CUDA: CuArray
 
 import Dolo
+gpuArray = CuArray
+interp_mode = :linear
 
-wk0 = Dolo.time_iteration_workspace(dm32);
+wk0 = Dolo.time_iteration_workspace(dm32; interp_mode=interp_mode);
+wk1 = Dolo.time_iteration_workspace(dm32; interp_mode=interp_mode);
+wk_gpu = Dolo.time_iteration_workspace(dm32, dest=gpuArray; interp_mode=interp_mode);
 
-# wk = Dolo.time_iteration_workspace(dm32, dest=CuArray)
-@time wk = Dolo.time_iteration_workspace(dm32, dest=oneArray; interp_mode=:cubic);
+@time sol1 = time_iteration(dm32, wk0; engine=:nothing, tol_η=1e-5, verbose=true, improve_wait=0, improve=false);
+@time sol3 = time_iteration(dm32, wk1; engine=:cpu, tol_η=1e-5, verbose=true, improve_wait=0, improve=false);
+@time sol2 = time_iteration(dm32, wk_gpu; engine=:gpu, tol_η=1e-5, verbose=true, improve_wait=0, improve=false);
 
 
-time_iteration(dm32, wk0; engine=:gpu, tol_η=1e-5, improve_wait=3, improve=true)
-time_iteration(dm32, wk; engine=:gpu, tol_η=1e-5, improve_wait=3, improve=true)
+
+
+wk0 = Dolo.time_iteration_workspace(dm32; interp_mode=:cubic);
+wk1 = Dolo.time_iteration_workspace(dm32; interp_mode=:cubic);
+wk_gpu = Dolo.time_iteration_workspace(dm32, dest=gpuArray; interp_mode=:cubic);
+
+@time sol1 = time_iteration(dm32, wk0; engine=:nothing, tol_η=1e-5, verbose=true, improve_wait=10, improve_K=100,improve=true);
+@time sol3 = time_iteration(dm32, wk1; engine=:cpu, tol_η=1e-5, verbose=true, improve_wait=10, improve_K=100, improve=true);
+# that one stops early
+@time sol2 = time_iteration(dm32, wk_gpu; engine=:gpu, tol_η=1e-5, verbose=true, improve_wait=10, improve_K=100, improve=true);
 
 
 
diff --git a/src/adapt.jl b/src/adapt.jl
index 90702d08..c778e955 100644
--- a/src/adapt.jl
+++ b/src/adapt.jl
@@ -35,10 +35,10 @@ function adapt_structure(to, L::LL{G,D,F}) where G where D where F
     LL(L.grid, adapt(to, L.D), adapt(to, L.φ))
 end
 
-
+maxabs(u::Number, v::Number) = abs(max(u,v))
 # should it be merged with the general definition?
-# import CUDA: CuArray
-# distance(x::GVector{G, A}, y::GVector{G,A}) where G where A<:CuArray = Base.mapreduce(u->maximum(u), max, x.data-y.data)
+import CUDA: CuArray
+distance(x::GVector{G, A}, y::GVector{G,A}) where G where A<:CuArray = Base.mapreduce(u->maximum(abs.(u)), max, x.data-y.data)
 
 import oneAPI: oneArray
-distance(x::GVector{G, A}, y::GVector{G,A}) where G where A<:oneArray = Base.mapreduce(u->maximum(u), max, x.data-y.data)
\ No newline at end of file
+distance(x::GVector{G, A}, y::GVector{G,A}) where G where A<:oneArray = Base.mapreduce(u->maximum(abs.(u)), max, x.data-y.data)
\ No newline at end of file

From a3cc8e28c5218ee3a1bcf15e9d733864cafa134c Mon Sep 17 00:00:00 2001
From: Pablo Winant <pablo.winant@gmail.com>
Date: Wed, 5 Jun 2024 00:09:17 +0200
Subject: [PATCH 12/17] Helper function for 32bits conversion.

---
 examples/ymodels/rbc_mc.jl  | 13 +++---
 misc/dev_conversion.jl      | 92 +++++++++++++++++++++++++++++++++++++
 src/algos/simul.jl          |  1 -
 src/algos/time_iteration.jl |  7 +--
 src/conversion.jl           |  2 +-
 src/grids.jl                |  8 ++++
 6 files changed, 111 insertions(+), 12 deletions(-)
 create mode 100644 misc/dev_conversion.jl

diff --git a/examples/ymodels/rbc_mc.jl b/examples/ymodels/rbc_mc.jl
index c5ec190e..54b54637 100644
--- a/examples/ymodels/rbc_mc.jl
+++ b/examples/ymodels/rbc_mc.jl
@@ -8,8 +8,8 @@ model = let
 
     # calibrate some parameters
     β = 0.99
-    σ = 5
-    η = 1
+    σ = 5.0
+    η = 1.0
     δ = 0.025
     α = 0.33
     ρ = 0.8
@@ -55,7 +55,8 @@ model = let
 
 end
 
-function Dolo.transition(model::typeof(model), s::NamedTuple, x::NamedTuple, M::NamedTuple)
+
+function Dolo.transition(model::hypeof(model), s::NamedTuple, x::NamedTuple, M::NamedTuple)
     
     (;δ, ρ) = model.calibration
     
@@ -68,7 +69,7 @@ end
 
 
 
-function intermediate(model::typeof(model),s::NamedTuple, x::NamedTuple)
+function intermediate(model::hypeof(model), s::NamedTuple, x::NamedTuple)
     
     p = model.calibration
 
@@ -81,7 +82,7 @@ function intermediate(model::typeof(model),s::NamedTuple, x::NamedTuple)
 end
 
 
-function arbitrage(model::typeof(model), s::NamedTuple, x::NamedTuple, S::NamedTuple, X::NamedTuple)
+function arbitrage(model::hypeof(model), s::NamedTuple, x::NamedTuple, S::NamedTuple, X::NamedTuple)
 
     p = model.calibration
 
@@ -95,6 +96,4 @@ function arbitrage(model::typeof(model), s::NamedTuple, x::NamedTuple, S::NamedT
 
 end
 
-
-
 model
\ No newline at end of file
diff --git a/misc/dev_conversion.jl b/misc/dev_conversion.jl
new file mode 100644
index 00000000..7502e384
--- /dev/null
+++ b/misc/dev_conversion.jl
@@ -0,0 +1,92 @@
+using Dolo
+using Dolo: transition
+function hypeof(model)
+    typ = typeof(model)
+    otyp = eltype(model)
+    ntyp = Float32
+    s = replace(string(typ),string(otyp)=>"Float32")
+    gtyp = ( Meta.parse(s) )
+    return Union{eval(gtyp), typ}
+end
+
+# hypeof(model)
+root_dir = pkgdir(Dolo)
+
+# macro mdef(funtree)
+#     println(funtree)
+#     println(dump(funtree))
+#     println("\nDONE\n")
+#     return funtree
+# end
+
+model = include("$(root_dir)/examples/ymodels/rbc_mc.jl")
+
+
+# macro mdef(funtree)
+#     return funtree
+# end
+
+# @mdef function fun(a)
+#     a = (;x=1,y=2)
+#     (;x, y) = a
+#     return (x*y)
+# end
+
+
+
+model32 = Dolo.convert_precision(Float32, model)
+
+
+
+s_ = Dolo.calibrated(model32, :states)
+x = Dolo.calibrated(model32, :controls)
+m_ = Dolo.calibrated(model32, :exogenous)
+
+s = [m_;s_]
+
+dm32 = Dolo.discretize(model32)
+Dolo.time_iteration(dm32, verbose=true)
+
+Dolo.time_iteration(model32)
+
+Dolo.transition(model32, s,x,m_)
+
+
+
+function Dolo.transition(model::typeof(model), s::NamedTuple, x::NamedTuple, M::NamedTuple)
+    
+    (;δ, ρ) = model.calibration
+    
+    # Z = e.Z
+    K = s.k * (1-δ) + x.i
+
+    (;k=K,)  ## This is only the endogenous state
+
+end
+
+
+model_32 = Dolo.convert_precision(Float32, model)
+# inherit all methods
+transition(m::typeof(model_32), args...) = transition(model, args...)
+arbitrage(m::typeof(model_32), args...) = arbitrage(model, args...)
+
+Dolo.time_iteration(model_32)
+
+
+dm32 = Dolo.discretize(model_32, Dict(:endo=>[1000]) )
+
+typeof(dm32)
+
+using Adapt
+import oneAPI: oneArray
+
+gpuArray = oneArray
+interp_mode = :linear
+
+wk0 = Dolo.time_iteration_workspace(dm32; interp_mode=interp_mode);
+wk1 = Dolo.time_iteration_workspace(dm32; interp_mode=interp_mode);
+wk_gpu = Dolo.time_iteration_workspace(dm32, dest=gpuArray; interp_mode=interp_mode);
+
+@time sol1 = time_iteration(dm32, wk0; engine=:nothing, tol_η=1e-5, verbose=true, improve_wait=0, improve=false);
+@time sol3 = time_iteration(dm32, wk1; engine=:cpu, tol_η=1e-5, verbose=true, improve_wait=0, improve=false);
+@time sol2 = time_iteration(dm32, wk_gpu; engine=:gpu, tol_η=1e-5, verbose=true, improve_wait=0, improve=false);
diff --git a/src/algos/simul.jl b/src/algos/simul.jl
index d31bc495..8442be56 100644
--- a/src/algos/simul.jl
+++ b/src/algos/simul.jl
@@ -11,7 +11,6 @@ function τ(dmodel::Dolo.DYModel{M}, ss::T, a::SVector) where M<:Union{Dolo.YMod
 
     (i,_) = ss.loc
     s_ = ss.val
-    
 
     # TODO: replace following block by one nonallocating function
     Q = dmodel.dproc.Q
diff --git a/src/algos/time_iteration.jl b/src/algos/time_iteration.jl
index efce75fe..6e226284 100644
--- a/src/algos/time_iteration.jl
+++ b/src/algos/time_iteration.jl
@@ -22,18 +22,19 @@
 function F(dmodel::M, s::QP, x::SVector{d,T}, φ::Union{Policy, GArray, DFun}) where M where d where T
 
     r = zero(SVector{d,T})
+
     for (w,S) in τ(dmodel, s, x)
         r += w*arbitrage(dmodel,s,x,S,φ(S)) 
     end
+
     # TODO: why does the following allocate ?
     # strange: if reloaded it doesn't allocate anymore
     # r += sum(
     #      w*arbitrage(model,s,x,S,φ(S)) 
     #      for (w,S) in τ(model, s, x)
     # )
-    r
-    # r::SVector{d,T}
-    # r = complementarities(dmodel.model, s,x,r)
+    r::SVector{d,T}
+    r = complementarities(dmodel.model, s,x,r)
     r
 end
 
diff --git a/src/conversion.jl b/src/conversion.jl
index 10fea021..73e7429d 100644
--- a/src/conversion.jl
+++ b/src/conversion.jl
@@ -18,7 +18,7 @@ function convert_precision(T, model::Dolo.YModel)
 
 
     Dolo.YModel(
-        names,
+        Dolo.name(model),
         states,
         controls,
         exogenous, 
diff --git a/src/grids.jl b/src/grids.jl
index 0200e742..b4829e66 100644
--- a/src/grids.jl
+++ b/src/grids.jl
@@ -108,6 +108,14 @@ getindex(g::PGrid{G1, G2, d}, ::Colon, i::Int64) where G1 where G2 where d = g.g
     return i + p*(j-1)
 end
    
+@inline function getindex(g::PGrid{G1, G2, d}, z::Complex{Int64}) where G1<:SGrid{d1} where G2<:CGrid{d2} where d where d1 where d2
+    # Tf = eltype(g)
+    # SVector{d,Tf}(g.grids[1][i]..., g.grids[2][j]...)
+    i,j = from_linear(g, z.re)
+    Dolo.QP((i,j), SVector(g.grids[1][i]..., g.grids[2][j]...))
+end
+
+
 @inline to__linear_index(g::PGrid, ind::Tuple{Int64, Int64}) =  ind[1] + length(g.grids[1])*(ind[2]-1)
 
 

From 51525f698264a7539dc6229da715f07f58b26c39 Mon Sep 17 00:00:00 2001
From: Pablo Winant <pablo.winant@gmail.com>
Date: Wed, 5 Jun 2024 15:01:11 +0200
Subject: [PATCH 13/17] WIP

---
 Project.toml                   | 13 ++++++++-----
 examples/ymodels/rbc_mc.jl     |  2 +-
 src/adapt.jl                   |  6 ------
 src/algos/time_iteration.jl    |  1 -
 src/conversion.jl              | 10 ++++++++++
 src/splines/cubic_prefilter.jl |  3 ---
 6 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/Project.toml b/Project.toml
index 462d832c..979dfcb2 100644
--- a/Project.toml
+++ b/Project.toml
@@ -3,19 +3,16 @@ uuid = "9d24351c-2990-5e1b-a277-04c4b809c898"
 version = "0.5.0-alpha"
 
 [deps]
-AbstractDifferentiation = "c29ec348-61ec-40c8-8164-b8c60e9d9f3d"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 AxisArrays = "39de3d68-74b9-583c-8d2d-e117c070f3a9"
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 BlockDiagonals = "0a1fb500-61f7-11e9-3c65-f5ef3456f9f0"
-CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Continuables = "79afa230-ca09-11e8-120b-5decf7bf5e25"
 Crayons = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 Dolang = "e5c7262c-e9d2-5620-ad8e-1af14eb8a8e3"
-Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41"
 FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"
 Formatting = "59287772-0a20-5a39-b81b-1366585eb4c0"
@@ -44,5 +41,11 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 StringDistances = "88034a9c-02f8-509d-84a9-84ec65e18404"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6"
-Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
-oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
+
+[extensions]
+DolooneAPIExt = "oneAPI"
+DoloCUDAExt = "CUDA"
+
+[weakdeps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
\ No newline at end of file
diff --git a/examples/ymodels/rbc_mc.jl b/examples/ymodels/rbc_mc.jl
index 54b54637..f16a2a52 100644
--- a/examples/ymodels/rbc_mc.jl
+++ b/examples/ymodels/rbc_mc.jl
@@ -1,6 +1,6 @@
 
 using StaticArrays
-import Dolo: transition, arbitrage, intermediate
+import Dolo: transition, arbitrage, intermediate, hypeof
 
 model = let 
 
diff --git a/src/adapt.jl b/src/adapt.jl
index c778e955..39873b6e 100644
--- a/src/adapt.jl
+++ b/src/adapt.jl
@@ -36,9 +36,3 @@ function adapt_structure(to, L::LL{G,D,F}) where G where D where F
 end
 
 maxabs(u::Number, v::Number) = abs(max(u,v))
-# should it be merged with the general definition?
-import CUDA: CuArray
-distance(x::GVector{G, A}, y::GVector{G,A}) where G where A<:CuArray = Base.mapreduce(u->maximum(abs.(u)), max, x.data-y.data)
-
-import oneAPI: oneArray
-distance(x::GVector{G, A}, y::GVector{G,A}) where G where A<:oneArray = Base.mapreduce(u->maximum(abs.(u)), max, x.data-y.data)
\ No newline at end of file
diff --git a/src/algos/time_iteration.jl b/src/algos/time_iteration.jl
index 6e226284..c9b361a9 100644
--- a/src/algos/time_iteration.jl
+++ b/src/algos/time_iteration.jl
@@ -176,7 +176,6 @@ function time_iteration(model::YModel; kwargs...)
     time_iteration(dmodel, wksp; kwargs2...)
 end
 
-using oneAPI: oneArray
 function time_iteration(model::DYModel,
     workspace=time_iteration_workspace(model);
     T=500,
diff --git a/src/conversion.jl b/src/conversion.jl
index 73e7429d..b4de4d41 100644
--- a/src/conversion.jl
+++ b/src/conversion.jl
@@ -37,3 +37,13 @@ end
 function convert_precision(Tnew, ps::Dolo.ProductSpace)
     Dolo.ProductSpace((convert_precision(Tnew, e) for e in ps.spaces)...)
 end
+
+
+function hypeof(model)
+    typ = typeof(model)
+    otyp = eltype(model)
+    ntyp = Float32
+    s = replace(string(typ),string(otyp)=>"Float32")
+    gtyp = ( Meta.parse(s) )
+    return Union{eval(gtyp), typ}
+end
\ No newline at end of file
diff --git a/src/splines/cubic_prefilter.jl b/src/splines/cubic_prefilter.jl
index e6ff2f74..f5988140 100644
--- a/src/splines/cubic_prefilter.jl
+++ b/src/splines/cubic_prefilter.jl
@@ -313,6 +313,3 @@ function prefilter!(data::AbstractArray{T,2}, ::Val{:KA}) where T
 
 end
 
-using oneAPI: oneArray
-
-prefilter!(data::oneArray) = prefilter!(data, Val(:KA))
\ No newline at end of file

From b8f8f2cef19c0c94761790521d17303dc03062a2 Mon Sep 17 00:00:00 2001
From: Pablo Winant <pablo.winant@gmail.com>
Date: Wed, 5 Jun 2024 15:02:29 +0200
Subject: [PATCH 14/17] Updated CI matrix.

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 8bd006b9..3d7921af 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -9,7 +9,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        julia-version: ['1.7','1.8']
+        julia-version: ['1.9','1.10']
         julia-arch: [x64]
         os: [ubuntu-latest, windows-latest, macOS-latest]
 

From 7d60c36ef1464cdc81ba4d249334c2eeca292422 Mon Sep 17 00:00:00 2001
From: Pablo Winant <pablo.winant@gmail.com>
Date: Wed, 5 Jun 2024 18:30:50 +0200
Subject: [PATCH 15/17] WIP

---
 src/splines/cubic_prefilter.jl | 89 ++++++++++++++++++++++++++++---
 src/splines/interp.jl          | 38 +++++++++++---
 src/splines/macros.jl          | 95 ++++++++++++++++++++++++----------
 test/test_interpolation.jl     |  2 +-
 4 files changed, 181 insertions(+), 43 deletions(-)

diff --git a/src/splines/cubic_prefilter.jl b/src/splines/cubic_prefilter.jl
index f5988140..7f7c9b7a 100644
--- a/src/splines/cubic_prefilter.jl
+++ b/src/splines/cubic_prefilter.jl
@@ -75,17 +75,29 @@ end
 
 # TODO: there are certainly some cases where we don't want MVectors there
 
-# function prefilter!(data::AbstractVector{T}) where T
+function prefilter!(data::AbstractVector{T}) where T
 
-#     N = length(data)
+    N = length(data)
 
-#     bands = zero(MMatrix{N, 3, Float64, N*3})
-#     bb = zero(MVector{N, T})
+    bands = zero(MMatrix{N, 3, Float64, N*3})
+    bb = zero(MVector{N, T})
 
-#     fill_bands!(N-2, bands, bb, data)
-#     solve_coefficients!(bands, bb, data)
+    fill_bands!(N-2, bands, bb, data)
+    solve_coefficients!(bands, bb, data)
 
-# end
+end
+
+function prefilter_1!(data::AbstractVector{T}) where T
+
+    N = length(data)
+
+    bands = zero(MMatrix{N, 3, Float64, N*3})
+    bb = zero(MVector{N, T})
+
+    fill_bands!(N-2, bands, bb, data)
+    solve_coefficients!(bands, bb, data)
+
+end
 
 const zq_0 = sqrt(3.0)-2.0
 const λ_0 = 6.0
@@ -94,10 +106,11 @@ getprecision(g::AbstractArray{T}) where T<:Number = T
 getprecision(g::AbstractArray{T}) where T<:SVector = eltype(g).types[1].types[1]
 
 
-function prefilter!(c::AbstractVector{T}) where T
+function prefilter_2!(c::AbstractVector{T}) where T
 
 
     prec = getprecision(c)
+
     λ = convert(prec, λ_0)
     zq = convert(prec, zq_0)
     
@@ -313,3 +326,63 @@ function prefilter!(data::AbstractArray{T,2}, ::Val{:KA}) where T
 
 end
 
+
+function prefilter!(data::AbstractArray{T,3}, ::Val{:KA}) where T
+
+
+    @kernel function ker_1(m)
+
+        c = @index(Global, Cartesian)
+        i = c.I[1]
+        j = c.I[2]
+
+        dat = view(m, i, j, :)
+        prefilter!(dat)
+
+    end
+
+    @kernel function ker_2(m)
+
+        c = @index(Global, Cartesian)
+        j = c.I[1]
+        k = c.I[2]
+        
+        dat = view(m, :, j, k)
+        prefilter!(dat)
+
+    end
+
+    @kernel function ker_3(m)
+
+
+        c = @index(Global, Cartesian)
+        i = c.I[1]
+        k = c.I[2]
+        
+        dat = view(m, i, :, k)
+        prefilter!(dat)
+
+    end
+
+
+    backend = get_backend(data)
+
+    if backend==CPU()
+        # Not clear how to set number of threads here
+        fun_1 = ker_1(backend, 8)
+        fun_2 = ker_2(backend, 8)
+        fun_3 = ker_3(backend, 8)
+    else
+        fun_1 = ker_1(backend)
+        fun_2 = ker_2(backend)
+        fun_3 = ker_3(backend)
+    end
+    
+    I,J,K = size(data)
+
+    fun_1(data, ndrange=(I,J))
+    fun_2(data, ndrange=(J,K))
+    fun_3(data, ndrange=(I,K))
+
+end
+
diff --git a/src/splines/interp.jl b/src/splines/interp.jl
index 725b38ce..c26356e0 100644
--- a/src/splines/interp.jl
+++ b/src/splines/interp.jl
@@ -16,20 +16,42 @@ end
 @inline function reduce_tensors(λ::SVector{2,U}, M::SArray{T,V,2,W})  where T where U where V where W
     v1 = SVector(1-λ[1],λ[1])
     v2 = SVector(1-λ[2],λ[2])
-    return M[1,1]*v1[1]*v2[1] + M[1,2]*v1[1]*v2[2] + M[2,1]*v1[2]*v2[1] + M[2,2]*v1[2]*v2[2]
+    return M[1,1]*v1[1]*v2[1] +
+           M[1,2]*v1[1]*v2[2] + 
+           M[2,1]*v1[2]*v2[1] + 
+           M[2,2]*v1[2]*v2[2]
+
+    # vv = tuple( (SVector(1-e, e) for e in λ)...) 
+    # xx = SVector( (prod(e) for e in Iterators.product(vv...))...)
+    # return sum( xx .* v[:])
+end
+
+@inline function reduce_tensors(λ::SVector{3,U}, M::SArray{T,V,3,W})  where T where U where V where W
+    v1 = SVector(1-λ[1],λ[1])
+    v2 = SVector(1-λ[2],λ[2])
+    v3 = SVector(1-λ[3],λ[3])
+    res =  M[1,1,1]*v1[1]*v2[1]*v3[1] +
+           M[2,1,1]*v1[2]*v2[1]*v3[1] +
+           M[1,2,1]*v1[1]*v2[2]*v3[1] +
+           M[2,2,1]*v1[2]*v2[2]*v3[1] +
+           M[1,1,2]*v1[1]*v2[1]*v3[2] +
+           M[2,1,2]*v1[2]*v2[1]*v3[2] +
+           M[1,2,2]*v1[1]*v2[2]*v3[2] +
+           M[2,2,2]*v1[2]*v2[2]*v3[2]
+    return res
     # vv = tuple( (SVector(1-e, e) for e in λ)...) 
     # xx = SVector( (prod(e) for e in Iterators.product(vv...))...)
     # return sum( xx .* v[:])
 end
 
 matextract(v::AbstractArray{T,3}, i,j,k) where T = SArray{Tuple{2, 2, 2}, T, 3, 8}(
-    v[i,  j  ,k],
-    v[i+1,j  ,k],
-    v[i  ,j+1,k],
-    v[i+1,j+1,k],
-    v[i  ,j  ,k+1],
-    v[i+1,j  ,k+1],
-    v[i  ,j+1,k+1],
+    v[  i,  j,k  ],
+    v[i+1,  j,k  ],
+    v[  i,j+1,k  ],
+    v[i+1,j+1,k  ],
+    v[  i,  j,k+1],
+    v[i+1,  j,k+1],
+    v[  i,j+1,k+1],
     v[i+1,j+1,k+1]
 )
 
diff --git a/src/splines/macros.jl b/src/splines/macros.jl
index 5e354d23..28065ccf 100644
--- a/src/splines/macros.jl
+++ b/src/splines/macros.jl
@@ -156,22 +156,21 @@ function create_parameters(d; Tf=Float64)
     return lines
 end
 
+
 function create_local_parameters(d; vectorize=true, Tf=Float64)
     lines = []
     if vectorize
         for i=1:d
-
             bl = quote
                 $(U("x",i)) = S[n][$i]
                 $(U("u",i)) = ($(U("x",i)) - $(U("start",i)))*$(U("dinv",i))
-                # $(U("i",i)) = (floor(Int,$(U("u",i)) ))
-                $(U("i",i)) = unsafe_trunc(Int, $(U("u",i)))
+                $(U("i",i)) = (floor(Int,$(U("u",i)) ))
                 $(U("i",i)) = max( min($(U("i",i)),$(U("M",i))-2), 0 )
                 $(U("t",i)) = $(U("u",i))-$(U("i",i))
                 $(U("tp",i,1)) = $(U("t",i))*$(U("t",i))*$(U("t",i))
                 $(U("tp",i,2)) = $(U("t",i))*$(U("t",i))
                 $(U("tp",i,3)) = $(U("t",i))
-                $(U("tp",i,4)) = convert($Tf,1.0);   #TODO
+                $(U("tp",i,4)) = 1.0;
             end
             for ll in bl.args
                 push!(lines, ll)
@@ -179,33 +178,17 @@ function create_local_parameters(d; vectorize=true, Tf=Float64)
         end
     else
         lines = []
-        e = quote
-                x_ = SVector(S...)
-                a_ = SVector( a... )
-                b_ = SVector( b... )
-                n_ = SVector( orders... )
-                δ_ = (b_.-a_)./(n_.-1)
-                i_ = div.( (x_.-a_), δ_)
-                i_ = max.(min.(i_, n_.-2), 0)
-                λ_ = (x_.-(a_ .+ δ_.*i_))./δ_
-                ii_ = unsafe_trunc.(Int,i_)
-
-            end
-        for x in e.args
-            push!(lines, x)
-        end
         for i=1:d
             bl = quote
-
-
-                # compat
-                $(U("i",i)) = ii_[$i]
-                $(U("t",i)) = λ_[$i]
-
+                $(U("x",i)) = S[$i]
+                $(U("u",i)) = ($(U("x",i)) - $(U("start",i)))*$(U("dinv",i))
+                $(U("i",i)) = (floor(Int,$(U("u",i)) ))
+                $(U("i",i)) = max( min($(U("i",i)),$(U("M",i))-2), 0 )
+                $(U("t",i)) = $(U("u",i))-$(U("i",i))
                 $(U("tp",i,1)) = $(U("t",i))*$(U("t",i))*$(U("t",i))
                 $(U("tp",i,2)) = $(U("t",i))*$(U("t",i))
                 $(U("tp",i,3)) = $(U("t",i))
-                $(U("tp",i,4)) = convert($Tf,1.0);
+                $(U("tp",i,4)) = 1.0;
             end
             for ll in bl.args
                 push!(lines, ll)
@@ -215,6 +198,66 @@ function create_local_parameters(d; vectorize=true, Tf=Float64)
     return lines
 end
 
+# function create_local_parameters(d; vectorize=true, Tf=Float64)
+#     lines = []
+#     if vectorize
+#         for i=1:d
+
+#             bl = quote
+#                 $(U("x",i)) = S[n][$i]
+#                 $(U("u",i)) = ($(U("x",i)) - $(U("start",i)))*$(U("dinv",i))
+#                 # $(U("i",i)) = (floor(Int,$(U("u",i)) ))
+#                 $(U("i",i)) = unsafe_trunc(Int, $(U("u",i)))
+#                 $(U("i",i)) = max( min($(U("i",i)),$(U("M",i))-2), 0 )
+#                 $(U("t",i)) = $(U("u",i))-$(U("i",i))
+#                 $(U("tp",i,1)) = $(U("t",i))*$(U("t",i))*$(U("t",i))
+#                 $(U("tp",i,2)) = $(U("t",i))*$(U("t",i))
+#                 $(U("tp",i,3)) = $(U("t",i))
+#                 $(U("tp",i,4)) = convert($Tf,1.0);   #TODO
+#             end
+#             for ll in bl.args
+#                 push!(lines, ll)
+#             end
+#         end
+#     else
+#         lines = []
+#         e = quote
+#                 x_ = SVector(S...)
+#                 a_ = SVector( a... )
+#                 b_ = SVector( b... )
+#                 n_ = SVector( orders... )
+#                 δ_ = (b_.-a_)./(n_.-1)
+#                 # i_ = div.( (x_.-a_), δ_)
+#                 i_ = SVector( ()...)
+#                 i_ = max.(min.(i_, n_.-2), 0)
+#                 λ_ = (x_.-(a_ .+ δ_.*i_))./δ_
+#                 ii_ = unsafe_trunc.(Int,i_)
+
+#             end
+#         for x in e.args
+#             push!(lines, x)
+#         end
+#         for i=1:d
+#             bl = quote
+
+
+#                 # compat
+#                 $(U("i",i)) = ii_[$i]
+#                 $(U("t",i)) = λ_[$i]
+
+#                 $(U("tp",i,1)) = $(U("t",i))*$(U("t",i))*$(U("t",i))
+#                 $(U("tp",i,2)) = $(U("t",i))*$(U("t",i))
+#                 $(U("tp",i,3)) = $(U("t",i))
+#                 $(U("tp",i,4)) = convert($Tf,1.0);
+#             end
+#             for ll in bl.args
+#                 push!(lines, ll)
+#             end
+#         end
+#     end
+#     return lines
+# end
+
 function create_function(d,extrap="natural"; vectorize=true, Tf=Float32)
     if vectorize
         expr = quote
diff --git a/test/test_interpolation.jl b/test/test_interpolation.jl
index 4ab3bd59..bf94c3dd 100644
--- a/test/test_interpolation.jl
+++ b/test/test_interpolation.jl
@@ -12,7 +12,7 @@ using StaticArrays
             NN = 20
 
             vars =  tuple( (Symbol("d$i") for i in 1:d)... )
-            pairs = (vars[i]=>[0.0, 1.0*i] for i in 1:d)
+            pairs = (vars[i]=>(0.0, 1.0*i) for i in 1:d)
             dvars = Dict( pairs )
             cs = Dolo.CSpace(;
                 pairs...

From 8b4c56dd470f41801bee80ed0498f8552d0a319b Mon Sep 17 00:00:00 2001
From: Pablo Winant <pablo.winant@gmail.com>
Date: Wed, 5 Jun 2024 18:39:57 +0200
Subject: [PATCH 16/17] Added extension.

---
 ext/DoloCUDAExt.jl   | 12 ++++++++++++
 ext/DolooneAPIExt.jl | 16 ++++++++++++++++
 2 files changed, 28 insertions(+)
 create mode 100644 ext/DoloCUDAExt.jl
 create mode 100644 ext/DolooneAPIExt.jl

diff --git a/ext/DoloCUDAExt.jl b/ext/DoloCUDAExt.jl
new file mode 100644
index 00000000..3624089c
--- /dev/null
+++ b/ext/DoloCUDAExt.jl
@@ -0,0 +1,12 @@
+module DoloCUDAExt
+
+    using CUDA
+    using Dolo
+# should it be merged with the general definition?
+    
+    import CUDA: CuArray
+    
+    Dolo.distance(x::GVector{G, A}, y::GVector{G,A}) where G where A<:CuArray = Base.mapreduce(u->maximum(abs.(u)), max, x.data-y.data)
+
+
+end
diff --git a/ext/DolooneAPIExt.jl b/ext/DolooneAPIExt.jl
new file mode 100644
index 00000000..95c2cd36
--- /dev/null
+++ b/ext/DolooneAPIExt.jl
@@ -0,0 +1,16 @@
+module DolooneAPIExt
+    
+    println("Loading oneAPI")
+    using oneAPI
+    import oneAPI: oneArray
+
+    using Dolo: distance
+
+    Dolo.distance(x::GVector{G, A}, y::GVector{G,A}) where G where A<:oneArray = Base.mapreduce(u->maximum(abs.(u)), max, x.data-y.data)
+
+    import Dolo.splines: prefilter!
+
+    prefilter!(data::oneArray) = prefilter!(data, Val(:KA))
+
+
+end

From 663d857991041662bab06ed55b3eb4cecd784538 Mon Sep 17 00:00:00 2001
From: Pablo Winant <pablo.winant@gmail.com>
Date: Wed, 5 Jun 2024 19:16:28 +0200
Subject: [PATCH 17/17] All tests pass.

---
 src/dolo_model.jl     |  4 +-
 src/garray.jl         |  2 +-
 src/space.jl          |  3 +-
 src/splines/macros.jl | 96 ++++++++++++-------------------------------
 4 files changed, 31 insertions(+), 74 deletions(-)

diff --git a/src/dolo_model.jl b/src/dolo_model.jl
index bd76af4a..6394e776 100644
--- a/src/dolo_model.jl
+++ b/src/dolo_model.jl
@@ -80,7 +80,9 @@ function discretize(model::YModel{<:VAR1}, d=Dict())
     n_s = length(Dolo.variables(model.states)) - d
     
     exo_grid = SGrid(dvar.Q)
-    endo_space = CartesianSpace{n_s, Dolo.variables(model.states)[d+1:end]}(
+    # TODO: simplify that call
+    Tf = eltype(model)
+    endo_space = CartesianSpace{n_s, Dolo.variables(model.states)[d+1:end],Tf}(
         model.states.min[d+1:end],
         model.states.max[d+1:end]
     )
diff --git a/src/garray.jl b/src/garray.jl
index 66074419..cca063ea 100644
--- a/src/garray.jl
+++ b/src/garray.jl
@@ -63,7 +63,7 @@ getindex(g::GArray{G,T}, i::Int64) where G where T = g.data[i]
 setindex!(g::GArray, x, i) = (g.data[i] = x)
 
 function setindex!(
-        g::GArray{Dolo.CGrid{2},Vector{T}},
+        g::GArray{<:Dolo.CGrid{2},Vector{T}},
         v::T,
         i::Int64, 
         j::Int64) where T
diff --git a/src/space.jl b/src/space.jl
index 33a0332d..34666da3 100644
--- a/src/space.jl
+++ b/src/space.jl
@@ -8,8 +8,7 @@ end
 
 const CSpace = CartesianSpace
 
-CartesianSpace(a::Tuple{Tf}, b::Tuple{Tf}) where Tf = CartesianSpace{length(a), (:x,), Tf}(a,b)
-CartesianSpace(a::Tuple{Tf, Tf}, b::Tuple{Tf, Tf}) where Tf = CartesianSpace{length(a), (:x_1, :x_2), Tf}(a,b)
+# CartesianSpace(a::NTuple{d,Tf}, b::NTuple{d, Tf}) where d where Tf = CartesianSpace{length(a), (:x,), Tf}(a,b)
 
 function CartesianSpace(kwargs::Pair{Symbol, Tuple{Tf, Tf}}...) where Tf
     
diff --git a/src/splines/macros.jl b/src/splines/macros.jl
index 28065ccf..c989ff82 100644
--- a/src/splines/macros.jl
+++ b/src/splines/macros.jl
@@ -109,7 +109,7 @@ function create_Phi(d, extrap, diff; Tf=Float64)
     return lines
 end
 
-function tensor_prod(symbs, inds, add_index=false)
+function tensor_prod(symbs, inds, add_index=false; Tf=Float64)
     if length(symbs)==0
         subscripts = [:($(U("i",i))+$(inds[i])) for i=1:length(inds)]
         if add_index
@@ -157,20 +157,23 @@ function create_parameters(d; Tf=Float64)
 end
 
 
+
 function create_local_parameters(d; vectorize=true, Tf=Float64)
     lines = []
     if vectorize
         for i=1:d
+
             bl = quote
                 $(U("x",i)) = S[n][$i]
                 $(U("u",i)) = ($(U("x",i)) - $(U("start",i)))*$(U("dinv",i))
-                $(U("i",i)) = (floor(Int,$(U("u",i)) ))
+                # $(U("i",i)) = (floor(Int,$(U("u",i)) ))
+                $(U("i",i)) = unsafe_trunc(Int, $(U("u",i)))
                 $(U("i",i)) = max( min($(U("i",i)),$(U("M",i))-2), 0 )
                 $(U("t",i)) = $(U("u",i))-$(U("i",i))
                 $(U("tp",i,1)) = $(U("t",i))*$(U("t",i))*$(U("t",i))
                 $(U("tp",i,2)) = $(U("t",i))*$(U("t",i))
                 $(U("tp",i,3)) = $(U("t",i))
-                $(U("tp",i,4)) = 1.0;
+                $(U("tp",i,4)) = convert($Tf,1.0);   #TODO
             end
             for ll in bl.args
                 push!(lines, ll)
@@ -178,17 +181,30 @@ function create_local_parameters(d; vectorize=true, Tf=Float64)
         end
     else
         lines = []
+        e = quote
+                x_ = SVector(S...)
+                a_ = SVector( a... )
+                b_ = SVector( b... )
+                n_ = SVector( orders... )
+                δ_ = (b_.-a_)./(n_.-1)
+                i_ = div.( (x_.-a_), δ_)
+                i_ = max.(min.(i_, n_.-2), 0)
+                λ_ = (x_.-(a_ .+ δ_.*i_))./δ_
+                ii_ = unsafe_trunc.(Int,i_)
+            end
+        for x in e.args
+            push!(lines, x)
+        end
         for i=1:d
             bl = quote
-                $(U("x",i)) = S[$i]
-                $(U("u",i)) = ($(U("x",i)) - $(U("start",i)))*$(U("dinv",i))
-                $(U("i",i)) = (floor(Int,$(U("u",i)) ))
-                $(U("i",i)) = max( min($(U("i",i)),$(U("M",i))-2), 0 )
-                $(U("t",i)) = $(U("u",i))-$(U("i",i))
+                # compat
+                $(U("i",i)) = ii_[$i]
+                $(U("t",i)) = λ_[$i]
+
                 $(U("tp",i,1)) = $(U("t",i))*$(U("t",i))*$(U("t",i))
                 $(U("tp",i,2)) = $(U("t",i))*$(U("t",i))
                 $(U("tp",i,3)) = $(U("t",i))
-                $(U("tp",i,4)) = 1.0;
+                $(U("tp",i,4)) = convert($Tf,1.0);
             end
             for ll in bl.args
                 push!(lines, ll)
@@ -198,67 +214,7 @@ function create_local_parameters(d; vectorize=true, Tf=Float64)
     return lines
 end
 
-# function create_local_parameters(d; vectorize=true, Tf=Float64)
-#     lines = []
-#     if vectorize
-#         for i=1:d
-
-#             bl = quote
-#                 $(U("x",i)) = S[n][$i]
-#                 $(U("u",i)) = ($(U("x",i)) - $(U("start",i)))*$(U("dinv",i))
-#                 # $(U("i",i)) = (floor(Int,$(U("u",i)) ))
-#                 $(U("i",i)) = unsafe_trunc(Int, $(U("u",i)))
-#                 $(U("i",i)) = max( min($(U("i",i)),$(U("M",i))-2), 0 )
-#                 $(U("t",i)) = $(U("u",i))-$(U("i",i))
-#                 $(U("tp",i,1)) = $(U("t",i))*$(U("t",i))*$(U("t",i))
-#                 $(U("tp",i,2)) = $(U("t",i))*$(U("t",i))
-#                 $(U("tp",i,3)) = $(U("t",i))
-#                 $(U("tp",i,4)) = convert($Tf,1.0);   #TODO
-#             end
-#             for ll in bl.args
-#                 push!(lines, ll)
-#             end
-#         end
-#     else
-#         lines = []
-#         e = quote
-#                 x_ = SVector(S...)
-#                 a_ = SVector( a... )
-#                 b_ = SVector( b... )
-#                 n_ = SVector( orders... )
-#                 δ_ = (b_.-a_)./(n_.-1)
-#                 # i_ = div.( (x_.-a_), δ_)
-#                 i_ = SVector( ()...)
-#                 i_ = max.(min.(i_, n_.-2), 0)
-#                 λ_ = (x_.-(a_ .+ δ_.*i_))./δ_
-#                 ii_ = unsafe_trunc.(Int,i_)
-
-#             end
-#         for x in e.args
-#             push!(lines, x)
-#         end
-#         for i=1:d
-#             bl = quote
-
-
-#                 # compat
-#                 $(U("i",i)) = ii_[$i]
-#                 $(U("t",i)) = λ_[$i]
-
-#                 $(U("tp",i,1)) = $(U("t",i))*$(U("t",i))*$(U("t",i))
-#                 $(U("tp",i,2)) = $(U("t",i))*$(U("t",i))
-#                 $(U("tp",i,3)) = $(U("t",i))
-#                 $(U("tp",i,4)) = convert($Tf,1.0);
-#             end
-#             for ll in bl.args
-#                 push!(lines, ll)
-#             end
-#         end
-#     end
-#     return lines
-# end
-
-function create_function(d,extrap="natural"; vectorize=true, Tf=Float32)
+function create_function(d,extrap="natural"; vectorize=true, Tf=Float64)
     if vectorize
         expr = quote
             function $(Symbol(string("eval_UC_spline!")))( a, b, orders, C::Array{T,d}, S::Vector{SVector{d,$Tf}}, V::Vector{T}) where d where T