From 9a673ca9a8bf24e6b8650359052546d6a774b0fd Mon Sep 17 00:00:00 2001
From: Pablo Winant <pablo.winant@gmail.com>
Date: Sun, 26 May 2024 16:25:49 +0200
Subject: [PATCH] WIP

---
 Project.toml                            |   1 +
 misc/dev_float32.jl                     |  92 ++++++-
 misc/test_gpu.jl                        |  14 +-
 src/Dolo.jl                             |   1 +
 src/adapt.jl                            |  36 +++
 src/algos/time_iteration.jl             |  20 +-
 src/algos/time_iteration_accelerated.jl |   6 +-
 src/grids.jl                            |  16 +-
 src/splines/csplines.jl                 |  12 +-
 src/splines/interp.jl                   |   3 +-
 src/splines/interp_misc.jl              | 347 ++++++++++++++++++++++++
 src/splines/macros.jl                   |  70 +++--
 12 files changed, 562 insertions(+), 56 deletions(-)
 create mode 100644 src/adapt.jl
 create mode 100644 src/splines/interp_misc.jl

diff --git a/Project.toml b/Project.toml
index 971d189..fbd4311 100644
--- a/Project.toml
+++ b/Project.toml
@@ -11,6 +11,7 @@ BlockDiagonals = "0a1fb500-61f7-11e9-3c65-f5ef3456f9f0"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Continuables = "79afa230-ca09-11e8-120b-5decf7bf5e25"
 Crayons = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
+Cthulhu = "f68482b8-f384-11e8-15f7-abe071a5a75f"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
diff --git a/misc/dev_float32.jl b/misc/dev_float32.jl
index f074c9d..45e3fc0 100644
--- a/misc/dev_float32.jl
+++ b/misc/dev_float32.jl
@@ -1,9 +1,95 @@
 using Dolo
 
 root_dir = pkgdir(Dolo)
-model32 = include("$(root_dir)/misc/rbc_float32.jl")
+model_32 = include("$(root_dir)/misc/rbc_float32.jl")
 
 
-dm32 = Dolo.discretize(model, Dict(:endo=>[10000000]) )
+model32 = Dolo.convert_precision(Float32, model_32)
 
-Dolo.convert
\ No newline at end of file
+dm32 = Dolo.discretize(model32, Dict(:endo=>[100]) )
+
+
+typeof(dm32)
+
+using Adapt
+
+
+wk0 = Dolo.time_iteration_workspace(dm32)
+
+import oneAPI: oneArray
+import Adapt: adapt_structure
+
+wk = adapt(oneArray, wk0)
+using KernelAbstractions: get_backend
+
+if typeof(dm32.grid)<:Dolo.CGrid
+    p = dm32.grid.ranges[1][3]
+    q = dm32.grid.ranges[2][3]
+else
+    p = length(dm32.grid.grids[1])
+    q = length(dm32.grid.grids[2])
+end
+
+t_e = get_backend(wk.x0)
+
+using KernelAbstractions
+using StaticArrays
+
+@kernel function ggg(r, dm, x, φ)
+# @kernel function fun(dm, r, x)
+
+
+    n = @index(Global, Linear)
+    ind = @index(Global, Cartesian)
+    (i,j) = ind.I
+
+
+    # k = length(dm.grid.grids[1])
+    # i_ = (n-1)÷k
+    # j_ = (n-1) - (i)*κ
+    # i = i_+1
+    # j = j_+1
+    
+    # TODO: special function here
+    s_ = dm.grid[n]
+    s = Dolo.QP((i,j), s_)
+
+    xx = x[n]
+
+    # (i,j) = @inline Dolo.from_linear(model.grid, n)    
+
+    rr = Dolo.F(dm, s, xx, φ)
+
+    r[n] = rr
+
+end
+
+
+
+fun_ = ggg(t_e)
+@time fun_(wk.r0, dm32, wk.x0, wk.φ; ndrange=(p,q))
+
+
+
+@time Dolo.F!(wk0.r0, dm32, wk0.x0, wk0.φ)
+
+# try
+
+    # res = fun_(wk.r0, dm32, wk.x0, wk.φ; ndrange=(p,q))
+# catch err
+    # nothing
+# end
+
+
+
+# Dolo.time_iteration(dm32, wk; engine=:gpu)
+
+
+s_0 = dm32.grid[10]
+s0 = Dolo.QP((2,2), s_0)
+xx0 = wk0.x0[10]
+
+
+fon(dm32, s0, xx0) = sum(w*S.val for (w,S) in Dolo.τ(dm32, s0, xx0))
+
+@code_warntype fon(dm32, s0, xx0)
\ No newline at end of file
diff --git a/misc/test_gpu.jl b/misc/test_gpu.jl
index c41c933..b9eaa03 100644
--- a/misc/test_gpu.jl
+++ b/misc/test_gpu.jl
@@ -107,12 +107,6 @@ spl = Dolo.SplineInterpolator{typeof(grid), typeof(C), 3}(grid, C)
 # end
 
 
-adapt_structure(to, s::Dolo.SplineInterpolator{G,C,k}) where G where C where k = let
-    θθ = adapt(to, s.θ)
-    CC = typeof(θθ)
-    Dolo.SplineInterpolator{G,CC,k}(s.grid, θθ)
-end
-
 
 gspl = adapt(CuArray, spl)
 
@@ -170,6 +164,14 @@ Dolo.calibrated(model, :controls)
 
 import Adapt: adapt_storage
 
+
+
+adapt_structure(to, s::Dolo.SplineInterpolator{G,C,k}) where G where C where k = let
+    θθ = adapt(to, s.θ)
+    CC = typeof(θθ)
+    Dolo.SplineInterpolator{G,CC,k}(s.grid, θθ)
+end
+
 function adapt_structure(to, v::Dolo.GVector{G,V}) where G where V
     data = adapt(to, v.data)
     Dolo.GVector{G, typeof(data)}(
diff --git a/src/Dolo.jl b/src/Dolo.jl
index 75ac7a0..b460bdc 100755
--- a/src/Dolo.jl
+++ b/src/Dolo.jl
@@ -83,6 +83,7 @@ module Dolo
     include("algos/vfi.jl")
 
 
+    include("adapt.jl")
     include("utils.jl")
 
     # function yaml_import(filename)
diff --git a/src/adapt.jl b/src/adapt.jl
new file mode 100644
index 0000000..10362ef
--- /dev/null
+++ b/src/adapt.jl
@@ -0,0 +1,36 @@
+import Adapt: adapt, adapt_structure
+
+adapt_structure(to, s::SplineInterpolator{G,C,k}) where G where C where k = let
+    θθ = adapt(to, s.θ)
+    CC = typeof(θθ)
+    SplineInterpolator{G,CC,k}(s.grid, θθ)
+end
+
+function adapt_structure(to, v::GVector{G,V}) where G where V
+    data = adapt(to, v.data)
+    GVector{G, typeof(data)}(
+        v.grid,
+        data
+    )
+end
+
+function adapt_structure(to, v::GArray{G,V}) where G where V
+    data = adapt(to, v.data)
+    GArray(
+        v.grid,
+        data
+    )
+end
+function adapt_structure(to, f::DFun{Dom, Gar, Itp, vars}) where Dom where Gar where Itp where vars
+    itp = adapt(to, f.itp)
+    values = adapt(to, f.values)
+    DFun{Dom, typeof(values), typeof(itp), vars}(
+        f.domain,
+        values,
+        itp
+    )
+end
+
+import KernelAbstractions: get_backend
+
+get_backend(g::GArray) = get_backend(g.data)
diff --git a/src/algos/time_iteration.jl b/src/algos/time_iteration.jl
index c8ec90c..f96a18b 100644
--- a/src/algos/time_iteration.jl
+++ b/src/algos/time_iteration.jl
@@ -126,9 +126,9 @@ end
 
 
 using LinearMaps
+using Adapt
 
-
-function time_iteration_workspace(dmodel; interp_mode=:linear)
+function time_iteration_workspace(dmodel; interp_mode=:linear, dest=Array)
 
     T = eltype(dmodel)
 
@@ -145,7 +145,10 @@ function time_iteration_workspace(dmodel; interp_mode=:linear)
     )
     vars = variables(dmodel.model.controls)
     φ = DFun(dmodel.model.states, x0, vars; interp_mode=interp_mode)
-    return (;x0, x1, x2, r0, dx, J, φ)
+
+    tt = (;x0, x1, x2, r0, dx, J, φ)
+
+    return adapt(dest, tt)
 
 end
 
@@ -199,16 +202,14 @@ function time_iteration(model::DYModel,
     convergence = false
     iterations = T
     
-    if engine==:cpu
-        t_engine = CPU()
-    elseif engine==:gpu
-        t_engine = GPU()
+    (;x0, x1, x2, dx, r0, J, φ) = workspace
+
+    if engine in (:cpu, :gpu)
+        t_engine = get_backend(workspace.x0)
     else
         t_engine = nothing
     end
 
-    (;x0, x1, x2, dx, r0, J, φ) = workspace
-
     ti_trace = trace ? IterationTrace(typeof(φ)[]) : nothing
 
     if improve
@@ -224,7 +225,6 @@ function time_iteration(model::DYModel,
         trace && push!(ti_trace.data, deepcopy(φ))
 
         F!(r0, model, x0, φ, t_engine)
-    
         # r0 = F(model, x0, φ)
 
         ε = norm(r0)
diff --git a/src/algos/time_iteration_accelerated.jl b/src/algos/time_iteration_accelerated.jl
index 1f085ef..a4ca41f 100644
--- a/src/algos/time_iteration_accelerated.jl
+++ b/src/algos/time_iteration_accelerated.jl
@@ -37,14 +37,12 @@ using KernelAbstractions
 # end
 
 # This is for a PGrid model only 
-function F!(r, model, x, φ, engine::Union{CPU, GPU})
+function F!(r, model, x, φ, engine)
 
     @kernel function FF_(r, @Const(model), @Const(x), @Const(φ))
 
-        # c = @index(Global, Cartesian)
-        # c = @index(Global, Cartesian)
+
         n = @index(Global, Linear)
-        # i,j = c.I
         (i,j) = Dolo.from_linear(model.grid, n)
 
         s_ = model.grid[n]
diff --git a/src/grids.jl b/src/grids.jl
index 07062c7..c406215 100644
--- a/src/grids.jl
+++ b/src/grids.jl
@@ -15,7 +15,7 @@ size(cg::CGrid{d}) where d = tuple((e[3] for e in cg.ranges)... )
 
 getindex(g::CGrid{d}, inds::Vararg{Int64,d}) where d = SVector{d}(
     ( 
-        ( g.ranges[i][1] + (g.ranges[i][2]-g.ranges[i][1])*( (inds[i]-1)/(g.ranges[i][3]-1)) )
+        ( g.ranges[i][1] + (g.ranges[i][2]-g.ranges[i][1])*( (inds[i]-1)/(g.ranges[i][3]-convert(eltype(eltype(g)),1))) )
         for i=1:d
     )
 )
@@ -85,13 +85,17 @@ cross(g1::SGrid{d1}, g2::CGrid{d2}) where d1 where d2 = PGrid(g1,g2)
 
 import Base: getindex
 
-from_linear(g::PGrid{G1, G2, d}, n) where G1 where G2 where d = let x=divrem(n-1, length(g.grids[1])); (x[2]+1, x[1]+1) end
+@inline from_linear(g::PGrid{G1, G2, d}, n::Int) where G1 where G2 where d = let x=divrem(n-1, length(g.grids[1])); (x[2]+1, x[1]+1) end
 
-getindex(g::PGrid{G1, G2, d}, n::Int) where G1 where G2 where d = getindex(g, from_linear(g, n)...)
+@inline getindex(g::PGrid{G1, G2, d}, n::Int) where G1 where G2 where d = getindex(g, from_linear(g, n)...)
 
-function getindex(g::PGrid{G1, G2, d}, i::Int64, j::Int64) where G1<:SGrid{d1} where G2<:CGrid{d2} where d where d1 where d2
-    Tf = eltype(g)
-    SVector{d,Tf}(g.grids[1][i]..., g.grids[2][j]...)
+# eltype(cg::CGrid{d,Tf}) where d where Tf = SVector{d, Tf}
+
+
+@inline function getindex(g::PGrid{G1, G2, d}, i::Int64, j::Int64) where G1<:SGrid{d1} where G2<:CGrid{d2} where d where d1 where d2
+    # Tf = eltype(g)
+    # SVector{d,Tf}(g.grids[1][i]..., g.grids[2][j]...)
+    SVector(g.grids[1][i]..., g.grids[2][j]...)
 end
 
 
diff --git a/src/splines/csplines.jl b/src/splines/csplines.jl
index 75a40a9..cdcf0ae 100644
--- a/src/splines/csplines.jl
+++ b/src/splines/csplines.jl
@@ -4,21 +4,21 @@ using StaticArrays
 
 
 # old style call
-function eval_UC_spline(a, b, orders, C::AbstractArray{Float64}, S::Matrix{Float64})
+function eval_UC_spline(a, b, orders, C::AbstractArray{Tf}, S::Matrix{Tf}) where Tf
     d,N = size(S)
 
     n_x = size(C,1)
     dims = tuple(size(C)[2:end]...)
     out = zeros(n_x,N)
-    CC = reshape(reinterpret(SVector{n_x,Float64},vec(C)),dims)
-    SS = reshape(reinterpret(SVector{d,Float64},vec(S)),(N,))
-    V  = reshape(reinterpret(SVector{n_x,Float64},vec(out)),(N,))
+    CC = reshape(reinterpret(SVector{n_x,Tf},vec(C)),dims)
+    SS = reshape(reinterpret(SVector{d,Tf},vec(S)),(N,))
+    V  = reshape(reinterpret(SVector{n_x,Tf},vec(out)),(N,))
     eval_UC_spline!(a, b, orders, CC, SS, V)
     return out
 end
 
 
-function eval_UC_spline(a, b, orders, C::AbstractArray{T, d}, S::Vector{SVector{d,Float64}}) where T where d
+function eval_UC_spline(a, b, orders, C::AbstractArray{T, d}, S::Vector{SVector{d,Tf}}) where T where d where Tf
     N = length(S)
     vals = zeros(T,N)
     eval_UC_spline!(a, b, orders, C, S, vals)
@@ -26,7 +26,7 @@ function eval_UC_spline(a, b, orders, C::AbstractArray{T, d}, S::Vector{SVector
 end
 
 
-@generated function eval_UC_spline!(a, b, orders, C::AbstractArray{T, d}, S::AbstractVector{SVector{d, Float64}}, V::AbstractVector{T}) where d where T
+@generated function eval_UC_spline!(a, b, orders, C::AbstractArray{T, d}, S::AbstractVector{SVector{d,Tf}}, V::AbstractVector{T}) where d where T where Tf
     # d = C.parameters[2]-1 # first dimension of C indexes the splines
     # the speed penalty of extrapolating points when iterating over point
     # seems very small so this is the default
diff --git a/src/splines/interp.jl b/src/splines/interp.jl
index 9b2abfb..c68a8f1 100644
--- a/src/splines/interp.jl
+++ b/src/splines/interp.jl
@@ -58,7 +58,8 @@ function interp(ranges::NTuple{d, Tuple{Tf, Tf, Int64}}, values::AbstractArray{T
 
     λ = (x.-(a .+ δ.*i))./δ
 
-    i_ = floor.(Int, i) .+ 1
+    # i_ = floor.(Int, i) .+ 1
+    i_ = unsafe_trunc.(Int, i) .+ 1
 
     M = matextract(values, i_...)
     
diff --git a/src/splines/interp_misc.jl b/src/splines/interp_misc.jl
new file mode 100644
index 0000000..8339b17
--- /dev/null
+++ b/src/splines/interp_misc.jl
@@ -0,0 +1,347 @@
+using StaticArrays
+import LoopVectorization: VectorizationBase
+import Base: getindex
+
+getindex(A::Vector{Float64}, i::VectorizationBase.Vec{4,Int64}) = VectorizationBase.Vec{4, Float64}(A[i(1)], A[i(2)], A[i(3)], A[i(4)])
+
+
+# ## TODO : rewrite the following
+# it is currently slower than the custom versions below
+
+@inline function reduce_tensors(λ::SVector{d1,U}, v::SArray{T,V,d2,W}) where d1 where d2 where T where U where V where W
+    vv = tuple( (SVector(1-e, e) for e in λ)...) 
+    xx = SVector( (prod(e) for e in Iterators.product(vv...))...)
+    return sum( xx .* v[:])
+end
+
+matextract(v::AbstractArray{T,3}, i,j,k) where T = SArray{Tuple{2, 2, 2}, T, 3, 8}(
+    v[i,  j  ,k],
+    v[i+1,j  ,k],
+    v[i  ,j+1,k],
+    v[i+1,j+1,k],
+    v[i  ,j  ,k+1],
+    v[i+1,j  ,k+1],
+    v[i  ,j+1,k+1],
+    v[i+1,j+1,k+1]
+)
+
+matextract(v::AbstractArray{T,2}, i, j) where T = SArray{Tuple{2,2}, T, 2, 4}(
+    v[i,j],
+    v[i+1,j],
+    v[i,j+1],
+    v[i+1,j+1]
+)
+matextract(v::AbstractArray{T,1}, i) where T = SArray{Tuple{2}, T, 1, 2}(
+    v[i],
+    v[i+1]
+)
+
+function interp(ranges::NTuple{d, Tuple{Float64, Float64, Int64}}, values::AbstractArray{T,d}, x::SVector{d, U}) where d where T where U
+    
+    a = SVector( (e[1] for e in ranges)... )
+    b = SVector( (e[2] for e in ranges)... )
+    n = SVector( (e[3] for e in ranges)... )
+
+    δ = (b.-a)./(n.-1)
+
+    i = div.( (x.-a), δ)
+
+    i = max.(min.(i, n.-2), 0)
+
+    λ = (x.-(a .+ δ.*i))./δ
+
+    i_ = floor.(Int, i) .+ 1
+
+    # inds = tuple( (SVector(e,e+1) for e in i_)...)
+    # return reduce_tensors( λ, values[inds...] )
+
+    return reduce_tensors( λ, matextract(values, i_...) )
+
+end
+
+function interp(ranges::NTuple{d, Tuple{Float64, Float64, Int64}}, values::AbstractArray{T,d}, x::Vararg{U}) where d where T where U
+    xx = SVector(x...)
+    interp(ranges, values, xx)
+end
+
+# @inline function reduce_tensors(λ::SVector{3,U}, v::SArray{T,Float64,3,V}) where d1 where d2 where T where U where V
+#     (1.0 - λ[3]) * (             
+#         (1.0 - λ[2])*(
+#             (1.0 - λ[1])*v[1,1,1] + λ[1]*v[2,1,1]
+#         ) + λ[2] * (
+#             (1.0 - λ[1])*v[1,2,1] + λ[1]*v[2,2,1]
+#         )
+#     )
+#     + λ[3] * (
+#         (1.0 - λ[2])*(
+#             (1.0 - λ[1])*v[1,1,2] + λ[1]*v[2,1,2]
+#         ) + λ[2] * (
+#             (1.0 - λ[1])*v[1,2,2] + λ[1]*v[2,2,2]
+#         )
+#     )
+# end
+
+
+
+
+# @inline function interp(ranges::Tuple{Tuple{Float64, Float64, Int64}}, values::AbstractArray{T}, x::U) where T where U
+# # function interp(ranges::Tuple{Tuple{Float64, Float64, Int64}}, values, x) where T where U 
+
+#     a = (ranges[1][1])
+#     b = (ranges[1][2])
+#     n = (ranges[1][3])
+
+#     δ = (b-a)/(n-1)
+
+#     i = div( (x-a), δ)
+
+#     i = max(min(i, n-2), 0)
+
+#     λ = (x-(a + δ*i))/δ
+
+#     i_ = floor(Int, i) + 1
+
+#     v0 = values[i_]
+#     v1 = values[i_+1]
+
+#     (1.0 - λ)*v0 + λ*v1
+
+# end
+
+# @inline function interp(ranges::Tuple{Tuple{Float64, Float64, Int64},Tuple{Float64, Float64, Int64}}, values::AbstractArray{T}, x_1::U, x_2::U) where T where U
+#     # function interp(ranges::Tuple{Tuple{Float64, Float64, Int64}}, values, x) where T where U 
+    
+#         a_1 = (ranges[1][1])
+#         b_1 = (ranges[1][2])
+#         n_1 = (ranges[1][3])
+    
+#         a_2 = (ranges[2][1])
+#         b_2 = (ranges[2][2])
+#         n_2 = (ranges[2][3])
+
+#         δ_1 = (b_1-a_1)/(n_1-1)
+#         δ_2 = (b_2-a_2)/(n_2-1)
+    
+#         i_1 = div( (x_1-a_1), δ_1)
+#         i_2 = div( (x_2-a_2), δ_2)
+    
+#         i_1 = max(min(i_1, n_1-2), 0)
+#         i_2 = max(min(i_2, n_2-2), 0)
+    
+#         λ_1 = (x_1-(a_1 + δ_1*i_1))/δ_1
+#         λ_2 = (x_2-(a_2 + δ_2*i_2))/δ_2
+    
+#         i_1_ = floor(Int, i_1) + 1
+#         i_2_ = floor(Int, i_2) + 1
+
+#         v00 = values[i_1_, i_2_]
+#         v01 = values[i_1_, i_2_+1]
+#         v10 = values[i_1_+1, i_2_]
+#         v11 = values[i_1_+1, i_2_+1]
+    
+#         res = (1.0 - λ_2)*(
+#             (1.0 - λ_1)*v00 + λ_1*v10
+#         ) + λ_2 * (
+#             (1.0 - λ_1)*v01 + λ_1*v11
+#         )
+    
+#         return res
+
+# end
+
+
+# @inline function interp(ranges::Tuple{
+#             Tuple{Float64, Float64, Int64},
+#             Tuple{Float64, Float64, Int64},
+#             Tuple{Float64, Float64, Int64}
+#         }, values::AbstractArray{T}, x_1::U, x_2::U, x_3::U) where T where U
+    
+#         a_1 = (ranges[1][1])
+#         b_1 = (ranges[1][2])
+#         n_1 = (ranges[1][3])
+    
+#         a_2 = (ranges[2][1])
+#         b_2 = (ranges[2][2])
+#         n_2 = (ranges[2][3])
+
+#         a_3 = (ranges[3][1])
+#         b_3 = (ranges[3][2])
+#         n_3 = (ranges[3][3])
+
+#         δ_1 = (b_1-a_1)/(n_1-1)
+#         δ_2 = (b_2-a_2)/(n_2-1)
+#         δ_3 = (b_3-a_3)/(n_3-1)
+    
+#         i_1 = div( (x_1-a_1), δ_1)
+#         i_2 = div( (x_2-a_2), δ_2)
+#         i_3 = div( (x_3-a_3), δ_3)
+
+
+#         i_1 = max(min(i_1, n_1-2), 0)
+#         i_2 = max(min(i_2, n_2-2), 0)
+#         i_3 = max(min(i_3, n_3-2), 0)
+    
+#         λ_1 = (x_1-(a_1 + δ_1*i_1))/δ_1
+#         λ_2 = (x_2-(a_2 + δ_2*i_2))/δ_2
+#         λ_3 = (x_3-(a_3 + δ_3*i_3))/δ_3
+    
+#         i_1_ = floor(Int, i_1) + 1
+#         i_2_ = floor(Int, i_2) + 1
+#         i_3_ = floor(Int, i_3) + 1
+
+#         v000 = values[i_1_,   i_2_,   i_3_]
+#         v001 = values[i_1_,   i_2_,   i_3_+1]
+#         v010 = values[i_1_,   i_2_+1, i_3_]
+#         v011 = values[i_1_,   i_2_+1, i_3_+1]
+#         v100 = values[i_1_+1, i_2_,   i_3_]
+#         v101 = values[i_1_+1, i_2_,   i_3_+1]
+#         v110 = values[i_1_+1, i_2_+1, i_3_]
+#         v111 = values[i_1_+1, i_2_+1, i_3_+1]
+
+
+#         res = (1.0 - λ_3) * (             
+#                 (1.0 - λ_2)*(
+#                     (1.0 - λ_1)*v000 + λ_1*v100
+#                 ) + λ_2 * (
+#                     (1.0 - λ_1)*v010 + λ_1*v110
+#                 )
+#             )
+#             + λ_3 * (
+#                 (1.0 - λ_2)*(
+#                     (1.0 - λ_1)*v001 + λ_1*v101
+#                 ) + λ_2 * (
+#                     (1.0 - λ_1)*v011 + λ_1*v111
+#                 )
+#             )
+
+#         return res
+
+# end
+
+
+function vecinterp_1(ranges, values::Vector{T}, x::Vector{<:Number}) where T
+    N = length(x)
+    out = zeros(T,N)
+    for n=1:N
+        out[n] = interp(ranges, values, x[n])
+    end
+    return out
+end
+
+
+function vecinterp_2(ranges, values::Vector{T}, x::Vector{<:Number}) where T
+    N = length(x)
+    out = zeros(T,N)
+    out .= (u->interp(ranges, values, u)).(x)
+    return out
+end
+
+using LoopVectorization
+
+function vecinterp_3(ranges, values::Vector{T}, x::Vector{<:Number}) where T
+    N = length(x)
+    out = zeros(T,N)
+    @simd for n=1:N
+        out[n] = interp(ranges, values, x[n])
+    end
+    return out
+end
+
+
+using LoopVectorization
+
+function vecinterp_4(ranges, values::Vector{T}, X::Vector{<:Number}) where T
+    
+    K = length(X)
+    out = zeros(T,K)
+
+    a = (ranges[1][1])
+    b = (ranges[1][2])
+    N = (ranges[1][3])
+
+    δ = (b-a)/(N-1)
+
+    @turbo for n=1:K
+
+        x = X[n]
+
+        i = div( (x-a), δ)
+
+        i = max(min(i, N-2), 0)
+
+        λ = (x-(a + δ*i))/δ
+
+        i_ = floor(Int, i) + 1
+
+        v0 = values[i_]
+        v1 = values[i_+1]
+        # v0 = getindex(values, i_)
+        # v1 = getindex(values, i_+1)
+
+        out[n] = (1-λ)*v0 + λ*v1
+    end
+    return out
+end
+
+function vecinterp_5(ranges, values::Vector{T}, x::Vector{<:Number}) where T
+    N = length(x)
+    out = zeros(T,N)
+    @turbo for n=1:N
+        ret = interp(ranges, values, x[n])
+        out[n] = ret
+    end
+    return out
+end
+
+
+
+
+
+# @code_warntype interp(ranges, values, 2.0)
+
+# xvec = [range(-0.1, 2.1; length=1000)...]
+# tvec = xvec .^ 2
+
+# yvec = [(interp(ranges, values, x)) for x in xvec]
+
+
+# using ForwardDiff
+
+# ForwardDiff.jacobian(u->[interp(ranges, values, u[1])], [0.3])
+
+# using Plots
+
+# plot(xvec, tvec)
+# plot!(xvec, yvec)
+
+# using LoopVectorization: VectorizationBase
+
+
+# a,b,N = ranges[1]
+# x = VectorizationBase.Vec{4, Float64}(4, 3, 2, 5)
+# δ = (b-a)/(N-1)
+
+# i = div( (x-a), δ)
+
+# i = max(min(i, N-2), 0)
+
+# λ = (x-(a + δ*i))/δ
+
+# i_ = floor(Int, i) + 1
+
+# v0 = values[i_]
+# v1 = values[i_+1]
+
+# out[n] = (1-λ)*v0 + λ*v1
+
+
+
+# import Base: getindex
+# getindex(A::Vector{T}, i::VectorizationBase.Vec{4,Int64}) where T where d = VectorizationBase.Vec{4, T}(A[i(1)], A[i(2)], A[i(3)], A[i(4)])
+
+# getindex(values, i_)
+
+
+# values[i_]
+
+# i_(1)
\ No newline at end of file
diff --git a/src/splines/macros.jl b/src/splines/macros.jl
index e0955ee..2629ae3 100644
--- a/src/splines/macros.jl
+++ b/src/splines/macros.jl
@@ -21,31 +21,59 @@ using StaticArrays
 #     0.0 0.0  1.0  0.0
 #  ]
 
-const Ad = [
+const Ad64 = [
    [-1.0/6.0  3.0/6.0 -3.0/6.0 1.0/6.0];
    [ 3.0/6.0 -6.0/6.0  0.0/6.0 4.0/6.0];
    [-3.0/6.0  3.0/6.0  3.0/6.0 1.0/6.0];
    [ 1.0/6.0  0.0/6.0  0.0/6.0 0.0/6.0]
 ]
 
-const dAd = [
+const dAd64 = [
    [ 0.0 -0.5  1.0 -0.5];
    [ 0.0  1.5 -2.0  0.0];
    [ 0.0 -1.5  1.0  0.5];
    [ 0.0  0.5  0.0  0.0]
 ]
 
-const d2Ad = [
+const d2Ad64 = [
    [ 0.0 0.0 -1.0  1.0];
    [ 0.0 0.0  3.0 -2.0];
    [ 0.0 0.0 -3.0  1.0];
    [ 0.0 0.0  1.0  0.0]
 ]
 
+const Ad32 = [
+   [-1.0f0/6.0f0  3.0f0/6.0f0 -3.0f0/6.0f0 1.0f0/6.0f0];
+   [ 3.0f0/6.0f0 -6.0f0/6.0f0  0.0f0/6.0f0 4.0f0/6.0f0];
+   [-3.0f0/6.0f0  3.0f0/6.0f0  3.0f0/6.0f0 1.0f0/6.0f0];
+   [ 1.0f0/6.0f0  0.0f0/6.0f0  0.0f0/6.0f0 0.0f0/6.0f0]
+]
+
+const dAd32 = [
+   [ 0.0f0 -0.5f0  1.0f0 -0.5f0];
+   [ 0.0f0  1.5f0 -2.0f0  0.0f0];
+   [ 0.0f0 -1.5f0  1.0f0  0.5f0];
+   [ 0.0f0  0.5f0  0.0f0  0.0f0]
+]
+
+const d2Ad32 = [
+   [ 0.0f0 0.0f0 -1.0f0  1.0f0];
+   [ 0.0f0 0.0f0  3.0f0 -2.0f0];
+   [ 0.0f0 0.0f0 -3.0f0  1.0f0];
+   [ 0.0f0 0.0f0  1.0f0  0.0f0]
+]
+
 U(s,i) = Symbol(string(s,i))
 U(s,i,j) = Symbol(string(s,i,"_",j))
 
-function create_Phi(d, extrap, diff)
+function create_Phi(d, extrap, diff; Tf=Float64)
+    if Tf==Float64
+        Ad = Ad64
+        dAd = dAd64
+    elseif Tf==Float32
+        Ad = Ad32
+        dAd = dAd32
+    end
     lines = []
     for i=1:d
         block = []
@@ -113,13 +141,13 @@ tensor_prod(["Phi_1", "Phi_2"], Int64[])   # Phi_1_1 * (Phi_2_1 * C[i1 + 1,i2 +
 tensor_prod(["Phi_1", "dPhi_2"], Int64[])  # Phi_1_1 * (dPhi_2_1 * C[i1 + 1,i2 + 1] + dPhi_2_2 * C[i1 + 1,i2 + 2] + dPhi_2_3 * C[i1 + 1,i2 + 3] + dPhi_2_4 * C[i1 + 1,i2 + 4]) + Phi_1_2 * (dPhi_2_1 * C[i1 + 2,i2 + 1] + dPhi_2_2 * C[i1 + 2,i2 + 2] + dPhi_2_3 * C[i1 + 2,i2 + 3] + dPhi_2_4 * C[i1 + 2,i2 + 4]) + Phi_1_3 * (dPhi_2_1 * C[i1 + 3,i2 + 1] + dPhi_2_2 * C[i1 + 3,i2 + 2] + dPhi_2_3 * C[i1 + 3,i2 + 3] + dPhi_2_4 * C[i1 + 3,i2 + 4]) + Phi_1_4 * (dPhi_2_1 * C[i1 + 4,i2 + 1] + dPhi_2_2 * C[i1 + 4,i2 + 2] + dPhi_2_3 * C[i1 + 4,i2 + 3] + dPhi_2_4 * C[i1 + 4,i2 + 4])
 
 
-function create_parameters(d)
+function create_parameters(d; Tf=Float64)
     lines = []
     for i=1:d
         block = quote
             $(U("M",i)) = orders[$i]
             $(U("start",i)) =  a[$i]
-            $(U("dinv",i)) = (orders[$i]-1.0)/(b[$i]-a[$i])
+            $(U("dinv",i)) = (orders[$i]-convert($Tf,1.0))/(b[$i]-a[$i])
         end
         for ll in block.args
             push!(lines, ll)
@@ -128,20 +156,21 @@ function create_parameters(d)
     return lines
 end
 
-function create_local_parameters(d; vectorize=true)
+function create_local_parameters(d; vectorize=true, Tf=Float64)
     lines = []
     if vectorize
         for i=1:d
             bl = quote
                 $(U("x",i)) = S[n][$i]
                 $(U("u",i)) = ($(U("x",i)) - $(U("start",i)))*$(U("dinv",i))
-                $(U("i",i)) = (floor(Int,$(U("u",i)) ))
+                # $(U("i",i)) = (floor(Int,$(U("u",i)) ))
+                $(U("i",i)) = unsafe_trunc(Int32, $(U("u",i)))
                 $(U("i",i)) = max( min($(U("i",i)),$(U("M",i))-2), 0 )
                 $(U("t",i)) = $(U("u",i))-$(U("i",i))
                 $(U("tp",i,1)) = $(U("t",i))*$(U("t",i))*$(U("t",i))
                 $(U("tp",i,2)) = $(U("t",i))*$(U("t",i))
                 $(U("tp",i,3)) = $(U("t",i))
-                $(U("tp",i,4)) = 1.0;
+                $(U("tp",i,4)) = convert($Tf,1.0);   #TODO
             end
             for ll in bl.args
                 push!(lines, ll)
@@ -153,13 +182,14 @@ function create_local_parameters(d; vectorize=true)
             bl = quote
                 $(U("x",i)) = S[$i]
                 $(U("u",i)) = ($(U("x",i)) - $(U("start",i)))*$(U("dinv",i))
-                $(U("i",i)) = (floor(Int,$(U("u",i)) ))
+                # $(U("i",i)) = (floor(Int,$(U("u",i)) ))
+                $(U("i",i)) = unsafe_trunc(Int32, $(U("u",i)) )
                 $(U("i",i)) = max( min($(U("i",i)),$(U("M",i))-2), 0 )
                 $(U("t",i)) = $(U("u",i))-$(U("i",i))
                 $(U("tp",i,1)) = $(U("t",i))*$(U("t",i))*$(U("t",i))
                 $(U("tp",i,2)) = $(U("t",i))*$(U("t",i))
                 $(U("tp",i,3)) = $(U("t",i))
-                $(U("tp",i,4)) = 1.0;
+                $(U("tp",i,4)) = convert($Tf,1.0);
             end
             for ll in bl.args
                 push!(lines, ll)
@@ -169,17 +199,17 @@ function create_local_parameters(d; vectorize=true)
     return lines
 end
 
-function create_function(d,extrap="natural"; vectorize=true)
+function create_function(d,extrap="natural"; vectorize=true, Tf=Float32)
     if vectorize
         expr = quote
-            function $(Symbol(string("eval_UC_spline!")))( a, b, orders, C::Array{T,d}, S::Vector{SVector{d,Float64}}, V::Vector{T}) where d where T
-                $(create_parameters(d)...)
+            function $(Symbol(string("eval_UC_spline!")))( a, b, orders, C::Array{T,d}, S::Vector{SVector{d,$Tf}}, V::Vector{T}) where d where T
+                $(create_parameters(d; Tf=Tf)...)
                 N = size(S,1)
                 # @fastmath @inbounds @simd( # doesn't seem to make any difference
                 for n=1:N
-                    $(create_local_parameters(d)...)
-                    $(create_Phi(d,extrap,false)...)
-                    V[n] = $( tensor_prod([string("Phi_",i) for i=1:d], Int64[]) )
+                    $(create_local_parameters(d;Tf=Tf)...)
+                    $(create_Phi(d,extrap,false;Tf=Tf)...)
+                    V[n] = $( tensor_prod([string("Phi_",i) for i=1:d], Int64[]; Tf=Tf) )
                 end
                 # )
             end
@@ -187,11 +217,11 @@ function create_function(d,extrap="natural"; vectorize=true)
     else
         expr = quote
             function $(Symbol(string("eval_UC_spline")))( a, b, orders, C::Array{T,d}, S::SVector{d,U}) where d where T where U
-                $(create_parameters(d)...)
+                $(create_parameters(d,Tf=Tf)...)
                 N = size(S,1)
                 # @fastmath @inbounds @simd( # doesn't seem to make any difference
-                $(create_local_parameters(d; vectorize=false)...)
-                $(create_Phi(d,extrap,false)...)
+                $(create_local_parameters(d; vectorize=false,Tf=Tf)...)
+                $(create_Phi(d,extrap,false; Tf=Tf)...)
                 V = $( tensor_prod([string("Phi_",i) for i=1:d], Int64[]) )
                 # )
                 return V