From 9a673ca9a8bf24e6b8650359052546d6a774b0fd Mon Sep 17 00:00:00 2001 From: Pablo Winant Date: Sun, 26 May 2024 16:25:49 +0200 Subject: [PATCH] WIP --- Project.toml | 1 + misc/dev_float32.jl | 92 ++++++- misc/test_gpu.jl | 14 +- src/Dolo.jl | 1 + src/adapt.jl | 36 +++ src/algos/time_iteration.jl | 20 +- src/algos/time_iteration_accelerated.jl | 6 +- src/grids.jl | 16 +- src/splines/csplines.jl | 12 +- src/splines/interp.jl | 3 +- src/splines/interp_misc.jl | 347 ++++++++++++++++++++++++ src/splines/macros.jl | 70 +++-- 12 files changed, 562 insertions(+), 56 deletions(-) create mode 100644 src/adapt.jl create mode 100644 src/splines/interp_misc.jl diff --git a/Project.toml b/Project.toml index 971d189..fbd4311 100644 --- a/Project.toml +++ b/Project.toml @@ -11,6 +11,7 @@ BlockDiagonals = "0a1fb500-61f7-11e9-3c65-f5ef3456f9f0" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Continuables = "79afa230-ca09-11e8-120b-5decf7bf5e25" Crayons = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" +Cthulhu = "f68482b8-f384-11e8-15f7-abe071a5a75f" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" diff --git a/misc/dev_float32.jl b/misc/dev_float32.jl index f074c9d..45e3fc0 100644 --- a/misc/dev_float32.jl +++ b/misc/dev_float32.jl @@ -1,9 +1,95 @@ using Dolo root_dir = pkgdir(Dolo) -model32 = include("$(root_dir)/misc/rbc_float32.jl") +model_32 = include("$(root_dir)/misc/rbc_float32.jl") -dm32 = Dolo.discretize(model, Dict(:endo=>[10000000]) ) +model32 = Dolo.convert_precision(Float32, model_32) -Dolo.convert \ No newline at end of file +dm32 = Dolo.discretize(model32, Dict(:endo=>[100]) ) + + +typeof(dm32) + +using Adapt + + +wk0 = Dolo.time_iteration_workspace(dm32) + +import oneAPI: oneArray +import Adapt: adapt_structure + +wk = adapt(oneArray, wk0) +using KernelAbstractions: get_backend + +if typeof(dm32.grid)<:Dolo.CGrid + p = dm32.grid.ranges[1][3] + q = dm32.grid.ranges[2][3] +else + p = length(dm32.grid.grids[1]) + q = length(dm32.grid.grids[2]) +end + +t_e = get_backend(wk.x0) + +using KernelAbstractions +using StaticArrays + +@kernel function ggg(r, dm, x, φ) +# @kernel function fun(dm, r, x) + + + n = @index(Global, Linear) + ind = @index(Global, Cartesian) + (i,j) = ind.I + + + # k = length(dm.grid.grids[1]) + # i_ = (n-1)÷k + # j_ = (n-1) - (i)*κ + # i = i_+1 + # j = j_+1 + + # TODO: special function here + s_ = dm.grid[n] + s = Dolo.QP((i,j), s_) + + xx = x[n] + + # (i,j) = @inline Dolo.from_linear(model.grid, n) + + rr = Dolo.F(dm, s, xx, φ) + + r[n] = rr + +end + + + +fun_ = ggg(t_e) +@time fun_(wk.r0, dm32, wk.x0, wk.φ; ndrange=(p,q)) + + + +@time Dolo.F!(wk0.r0, dm32, wk0.x0, wk0.φ) + +# try + + # res = fun_(wk.r0, dm32, wk.x0, wk.φ; ndrange=(p,q)) +# catch err + # nothing +# end + + + +# Dolo.time_iteration(dm32, wk; engine=:gpu) + + +s_0 = dm32.grid[10] +s0 = Dolo.QP((2,2), s_0) +xx0 = wk0.x0[10] + + +fon(dm32, s0, xx0) = sum(w*S.val for (w,S) in Dolo.τ(dm32, s0, xx0)) + +@code_warntype fon(dm32, s0, xx0) \ No newline at end of file diff --git a/misc/test_gpu.jl b/misc/test_gpu.jl index c41c933..b9eaa03 100644 --- a/misc/test_gpu.jl +++ b/misc/test_gpu.jl @@ -107,12 +107,6 @@ spl = Dolo.SplineInterpolator{typeof(grid), typeof(C), 3}(grid, C) # end -adapt_structure(to, s::Dolo.SplineInterpolator{G,C,k}) where G where C where k = let - θθ = adapt(to, s.θ) - CC = typeof(θθ) - Dolo.SplineInterpolator{G,CC,k}(s.grid, θθ) -end - gspl = adapt(CuArray, spl) @@ -170,6 +164,14 @@ Dolo.calibrated(model, :controls) import Adapt: adapt_storage + + +adapt_structure(to, s::Dolo.SplineInterpolator{G,C,k}) where G where C where k = let + θθ = adapt(to, s.θ) + CC = typeof(θθ) + Dolo.SplineInterpolator{G,CC,k}(s.grid, θθ) +end + function adapt_structure(to, v::Dolo.GVector{G,V}) where G where V data = adapt(to, v.data) Dolo.GVector{G, typeof(data)}( diff --git a/src/Dolo.jl b/src/Dolo.jl index 75ac7a0..b460bdc 100755 --- a/src/Dolo.jl +++ b/src/Dolo.jl @@ -83,6 +83,7 @@ module Dolo include("algos/vfi.jl") + include("adapt.jl") include("utils.jl") # function yaml_import(filename) diff --git a/src/adapt.jl b/src/adapt.jl new file mode 100644 index 0000000..10362ef --- /dev/null +++ b/src/adapt.jl @@ -0,0 +1,36 @@ +import Adapt: adapt, adapt_structure + +adapt_structure(to, s::SplineInterpolator{G,C,k}) where G where C where k = let + θθ = adapt(to, s.θ) + CC = typeof(θθ) + SplineInterpolator{G,CC,k}(s.grid, θθ) +end + +function adapt_structure(to, v::GVector{G,V}) where G where V + data = adapt(to, v.data) + GVector{G, typeof(data)}( + v.grid, + data + ) +end + +function adapt_structure(to, v::GArray{G,V}) where G where V + data = adapt(to, v.data) + GArray( + v.grid, + data + ) +end +function adapt_structure(to, f::DFun{Dom, Gar, Itp, vars}) where Dom where Gar where Itp where vars + itp = adapt(to, f.itp) + values = adapt(to, f.values) + DFun{Dom, typeof(values), typeof(itp), vars}( + f.domain, + values, + itp + ) +end + +import KernelAbstractions: get_backend + +get_backend(g::GArray) = get_backend(g.data) diff --git a/src/algos/time_iteration.jl b/src/algos/time_iteration.jl index c8ec90c..f96a18b 100644 --- a/src/algos/time_iteration.jl +++ b/src/algos/time_iteration.jl @@ -126,9 +126,9 @@ end using LinearMaps +using Adapt - -function time_iteration_workspace(dmodel; interp_mode=:linear) +function time_iteration_workspace(dmodel; interp_mode=:linear, dest=Array) T = eltype(dmodel) @@ -145,7 +145,10 @@ function time_iteration_workspace(dmodel; interp_mode=:linear) ) vars = variables(dmodel.model.controls) φ = DFun(dmodel.model.states, x0, vars; interp_mode=interp_mode) - return (;x0, x1, x2, r0, dx, J, φ) + + tt = (;x0, x1, x2, r0, dx, J, φ) + + return adapt(dest, tt) end @@ -199,16 +202,14 @@ function time_iteration(model::DYModel, convergence = false iterations = T - if engine==:cpu - t_engine = CPU() - elseif engine==:gpu - t_engine = GPU() + (;x0, x1, x2, dx, r0, J, φ) = workspace + + if engine in (:cpu, :gpu) + t_engine = get_backend(workspace.x0) else t_engine = nothing end - (;x0, x1, x2, dx, r0, J, φ) = workspace - ti_trace = trace ? IterationTrace(typeof(φ)[]) : nothing if improve @@ -224,7 +225,6 @@ function time_iteration(model::DYModel, trace && push!(ti_trace.data, deepcopy(φ)) F!(r0, model, x0, φ, t_engine) - # r0 = F(model, x0, φ) ε = norm(r0) diff --git a/src/algos/time_iteration_accelerated.jl b/src/algos/time_iteration_accelerated.jl index 1f085ef..a4ca41f 100644 --- a/src/algos/time_iteration_accelerated.jl +++ b/src/algos/time_iteration_accelerated.jl @@ -37,14 +37,12 @@ using KernelAbstractions # end # This is for a PGrid model only -function F!(r, model, x, φ, engine::Union{CPU, GPU}) +function F!(r, model, x, φ, engine) @kernel function FF_(r, @Const(model), @Const(x), @Const(φ)) - # c = @index(Global, Cartesian) - # c = @index(Global, Cartesian) + n = @index(Global, Linear) - # i,j = c.I (i,j) = Dolo.from_linear(model.grid, n) s_ = model.grid[n] diff --git a/src/grids.jl b/src/grids.jl index 07062c7..c406215 100644 --- a/src/grids.jl +++ b/src/grids.jl @@ -15,7 +15,7 @@ size(cg::CGrid{d}) where d = tuple((e[3] for e in cg.ranges)... ) getindex(g::CGrid{d}, inds::Vararg{Int64,d}) where d = SVector{d}( ( - ( g.ranges[i][1] + (g.ranges[i][2]-g.ranges[i][1])*( (inds[i]-1)/(g.ranges[i][3]-1)) ) + ( g.ranges[i][1] + (g.ranges[i][2]-g.ranges[i][1])*( (inds[i]-1)/(g.ranges[i][3]-convert(eltype(eltype(g)),1))) ) for i=1:d ) ) @@ -85,13 +85,17 @@ cross(g1::SGrid{d1}, g2::CGrid{d2}) where d1 where d2 = PGrid(g1,g2) import Base: getindex -from_linear(g::PGrid{G1, G2, d}, n) where G1 where G2 where d = let x=divrem(n-1, length(g.grids[1])); (x[2]+1, x[1]+1) end +@inline from_linear(g::PGrid{G1, G2, d}, n::Int) where G1 where G2 where d = let x=divrem(n-1, length(g.grids[1])); (x[2]+1, x[1]+1) end -getindex(g::PGrid{G1, G2, d}, n::Int) where G1 where G2 where d = getindex(g, from_linear(g, n)...) +@inline getindex(g::PGrid{G1, G2, d}, n::Int) where G1 where G2 where d = getindex(g, from_linear(g, n)...) -function getindex(g::PGrid{G1, G2, d}, i::Int64, j::Int64) where G1<:SGrid{d1} where G2<:CGrid{d2} where d where d1 where d2 - Tf = eltype(g) - SVector{d,Tf}(g.grids[1][i]..., g.grids[2][j]...) +# eltype(cg::CGrid{d,Tf}) where d where Tf = SVector{d, Tf} + + +@inline function getindex(g::PGrid{G1, G2, d}, i::Int64, j::Int64) where G1<:SGrid{d1} where G2<:CGrid{d2} where d where d1 where d2 + # Tf = eltype(g) + # SVector{d,Tf}(g.grids[1][i]..., g.grids[2][j]...) + SVector(g.grids[1][i]..., g.grids[2][j]...) end diff --git a/src/splines/csplines.jl b/src/splines/csplines.jl index 75a40a9..cdcf0ae 100644 --- a/src/splines/csplines.jl +++ b/src/splines/csplines.jl @@ -4,21 +4,21 @@ using StaticArrays # old style call -function eval_UC_spline(a, b, orders, C::AbstractArray{Float64}, S::Matrix{Float64}) +function eval_UC_spline(a, b, orders, C::AbstractArray{Tf}, S::Matrix{Tf}) where Tf d,N = size(S) n_x = size(C,1) dims = tuple(size(C)[2:end]...) out = zeros(n_x,N) - CC = reshape(reinterpret(SVector{n_x,Float64},vec(C)),dims) - SS = reshape(reinterpret(SVector{d,Float64},vec(S)),(N,)) - V = reshape(reinterpret(SVector{n_x,Float64},vec(out)),(N,)) + CC = reshape(reinterpret(SVector{n_x,Tf},vec(C)),dims) + SS = reshape(reinterpret(SVector{d,Tf},vec(S)),(N,)) + V = reshape(reinterpret(SVector{n_x,Tf},vec(out)),(N,)) eval_UC_spline!(a, b, orders, CC, SS, V) return out end -function eval_UC_spline(a, b, orders, C::AbstractArray{T, d}, S::Vector{SVector{d,Float64}}) where T where d +function eval_UC_spline(a, b, orders, C::AbstractArray{T, d}, S::Vector{SVector{d,Tf}}) where T where d where Tf N = length(S) vals = zeros(T,N) eval_UC_spline!(a, b, orders, C, S, vals) @@ -26,7 +26,7 @@ function eval_UC_spline(a, b, orders, C::AbstractArray{T, d}, S::Vector{SVector end -@generated function eval_UC_spline!(a, b, orders, C::AbstractArray{T, d}, S::AbstractVector{SVector{d, Float64}}, V::AbstractVector{T}) where d where T +@generated function eval_UC_spline!(a, b, orders, C::AbstractArray{T, d}, S::AbstractVector{SVector{d,Tf}}, V::AbstractVector{T}) where d where T where Tf # d = C.parameters[2]-1 # first dimension of C indexes the splines # the speed penalty of extrapolating points when iterating over point # seems very small so this is the default diff --git a/src/splines/interp.jl b/src/splines/interp.jl index 9b2abfb..c68a8f1 100644 --- a/src/splines/interp.jl +++ b/src/splines/interp.jl @@ -58,7 +58,8 @@ function interp(ranges::NTuple{d, Tuple{Tf, Tf, Int64}}, values::AbstractArray{T λ = (x.-(a .+ δ.*i))./δ - i_ = floor.(Int, i) .+ 1 + # i_ = floor.(Int, i) .+ 1 + i_ = unsafe_trunc.(Int, i) .+ 1 M = matextract(values, i_...) diff --git a/src/splines/interp_misc.jl b/src/splines/interp_misc.jl new file mode 100644 index 0000000..8339b17 --- /dev/null +++ b/src/splines/interp_misc.jl @@ -0,0 +1,347 @@ +using StaticArrays +import LoopVectorization: VectorizationBase +import Base: getindex + +getindex(A::Vector{Float64}, i::VectorizationBase.Vec{4,Int64}) = VectorizationBase.Vec{4, Float64}(A[i(1)], A[i(2)], A[i(3)], A[i(4)]) + + +# ## TODO : rewrite the following +# it is currently slower than the custom versions below + +@inline function reduce_tensors(λ::SVector{d1,U}, v::SArray{T,V,d2,W}) where d1 where d2 where T where U where V where W + vv = tuple( (SVector(1-e, e) for e in λ)...) + xx = SVector( (prod(e) for e in Iterators.product(vv...))...) + return sum( xx .* v[:]) +end + +matextract(v::AbstractArray{T,3}, i,j,k) where T = SArray{Tuple{2, 2, 2}, T, 3, 8}( + v[i, j ,k], + v[i+1,j ,k], + v[i ,j+1,k], + v[i+1,j+1,k], + v[i ,j ,k+1], + v[i+1,j ,k+1], + v[i ,j+1,k+1], + v[i+1,j+1,k+1] +) + +matextract(v::AbstractArray{T,2}, i, j) where T = SArray{Tuple{2,2}, T, 2, 4}( + v[i,j], + v[i+1,j], + v[i,j+1], + v[i+1,j+1] +) +matextract(v::AbstractArray{T,1}, i) where T = SArray{Tuple{2}, T, 1, 2}( + v[i], + v[i+1] +) + +function interp(ranges::NTuple{d, Tuple{Float64, Float64, Int64}}, values::AbstractArray{T,d}, x::SVector{d, U}) where d where T where U + + a = SVector( (e[1] for e in ranges)... ) + b = SVector( (e[2] for e in ranges)... ) + n = SVector( (e[3] for e in ranges)... ) + + δ = (b.-a)./(n.-1) + + i = div.( (x.-a), δ) + + i = max.(min.(i, n.-2), 0) + + λ = (x.-(a .+ δ.*i))./δ + + i_ = floor.(Int, i) .+ 1 + + # inds = tuple( (SVector(e,e+1) for e in i_)...) + # return reduce_tensors( λ, values[inds...] ) + + return reduce_tensors( λ, matextract(values, i_...) ) + +end + +function interp(ranges::NTuple{d, Tuple{Float64, Float64, Int64}}, values::AbstractArray{T,d}, x::Vararg{U}) where d where T where U + xx = SVector(x...) + interp(ranges, values, xx) +end + +# @inline function reduce_tensors(λ::SVector{3,U}, v::SArray{T,Float64,3,V}) where d1 where d2 where T where U where V +# (1.0 - λ[3]) * ( +# (1.0 - λ[2])*( +# (1.0 - λ[1])*v[1,1,1] + λ[1]*v[2,1,1] +# ) + λ[2] * ( +# (1.0 - λ[1])*v[1,2,1] + λ[1]*v[2,2,1] +# ) +# ) +# + λ[3] * ( +# (1.0 - λ[2])*( +# (1.0 - λ[1])*v[1,1,2] + λ[1]*v[2,1,2] +# ) + λ[2] * ( +# (1.0 - λ[1])*v[1,2,2] + λ[1]*v[2,2,2] +# ) +# ) +# end + + + + +# @inline function interp(ranges::Tuple{Tuple{Float64, Float64, Int64}}, values::AbstractArray{T}, x::U) where T where U +# # function interp(ranges::Tuple{Tuple{Float64, Float64, Int64}}, values, x) where T where U + +# a = (ranges[1][1]) +# b = (ranges[1][2]) +# n = (ranges[1][3]) + +# δ = (b-a)/(n-1) + +# i = div( (x-a), δ) + +# i = max(min(i, n-2), 0) + +# λ = (x-(a + δ*i))/δ + +# i_ = floor(Int, i) + 1 + +# v0 = values[i_] +# v1 = values[i_+1] + +# (1.0 - λ)*v0 + λ*v1 + +# end + +# @inline function interp(ranges::Tuple{Tuple{Float64, Float64, Int64},Tuple{Float64, Float64, Int64}}, values::AbstractArray{T}, x_1::U, x_2::U) where T where U +# # function interp(ranges::Tuple{Tuple{Float64, Float64, Int64}}, values, x) where T where U + +# a_1 = (ranges[1][1]) +# b_1 = (ranges[1][2]) +# n_1 = (ranges[1][3]) + +# a_2 = (ranges[2][1]) +# b_2 = (ranges[2][2]) +# n_2 = (ranges[2][3]) + +# δ_1 = (b_1-a_1)/(n_1-1) +# δ_2 = (b_2-a_2)/(n_2-1) + +# i_1 = div( (x_1-a_1), δ_1) +# i_2 = div( (x_2-a_2), δ_2) + +# i_1 = max(min(i_1, n_1-2), 0) +# i_2 = max(min(i_2, n_2-2), 0) + +# λ_1 = (x_1-(a_1 + δ_1*i_1))/δ_1 +# λ_2 = (x_2-(a_2 + δ_2*i_2))/δ_2 + +# i_1_ = floor(Int, i_1) + 1 +# i_2_ = floor(Int, i_2) + 1 + +# v00 = values[i_1_, i_2_] +# v01 = values[i_1_, i_2_+1] +# v10 = values[i_1_+1, i_2_] +# v11 = values[i_1_+1, i_2_+1] + +# res = (1.0 - λ_2)*( +# (1.0 - λ_1)*v00 + λ_1*v10 +# ) + λ_2 * ( +# (1.0 - λ_1)*v01 + λ_1*v11 +# ) + +# return res + +# end + + +# @inline function interp(ranges::Tuple{ +# Tuple{Float64, Float64, Int64}, +# Tuple{Float64, Float64, Int64}, +# Tuple{Float64, Float64, Int64} +# }, values::AbstractArray{T}, x_1::U, x_2::U, x_3::U) where T where U + +# a_1 = (ranges[1][1]) +# b_1 = (ranges[1][2]) +# n_1 = (ranges[1][3]) + +# a_2 = (ranges[2][1]) +# b_2 = (ranges[2][2]) +# n_2 = (ranges[2][3]) + +# a_3 = (ranges[3][1]) +# b_3 = (ranges[3][2]) +# n_3 = (ranges[3][3]) + +# δ_1 = (b_1-a_1)/(n_1-1) +# δ_2 = (b_2-a_2)/(n_2-1) +# δ_3 = (b_3-a_3)/(n_3-1) + +# i_1 = div( (x_1-a_1), δ_1) +# i_2 = div( (x_2-a_2), δ_2) +# i_3 = div( (x_3-a_3), δ_3) + + +# i_1 = max(min(i_1, n_1-2), 0) +# i_2 = max(min(i_2, n_2-2), 0) +# i_3 = max(min(i_3, n_3-2), 0) + +# λ_1 = (x_1-(a_1 + δ_1*i_1))/δ_1 +# λ_2 = (x_2-(a_2 + δ_2*i_2))/δ_2 +# λ_3 = (x_3-(a_3 + δ_3*i_3))/δ_3 + +# i_1_ = floor(Int, i_1) + 1 +# i_2_ = floor(Int, i_2) + 1 +# i_3_ = floor(Int, i_3) + 1 + +# v000 = values[i_1_, i_2_, i_3_] +# v001 = values[i_1_, i_2_, i_3_+1] +# v010 = values[i_1_, i_2_+1, i_3_] +# v011 = values[i_1_, i_2_+1, i_3_+1] +# v100 = values[i_1_+1, i_2_, i_3_] +# v101 = values[i_1_+1, i_2_, i_3_+1] +# v110 = values[i_1_+1, i_2_+1, i_3_] +# v111 = values[i_1_+1, i_2_+1, i_3_+1] + + +# res = (1.0 - λ_3) * ( +# (1.0 - λ_2)*( +# (1.0 - λ_1)*v000 + λ_1*v100 +# ) + λ_2 * ( +# (1.0 - λ_1)*v010 + λ_1*v110 +# ) +# ) +# + λ_3 * ( +# (1.0 - λ_2)*( +# (1.0 - λ_1)*v001 + λ_1*v101 +# ) + λ_2 * ( +# (1.0 - λ_1)*v011 + λ_1*v111 +# ) +# ) + +# return res + +# end + + +function vecinterp_1(ranges, values::Vector{T}, x::Vector{<:Number}) where T + N = length(x) + out = zeros(T,N) + for n=1:N + out[n] = interp(ranges, values, x[n]) + end + return out +end + + +function vecinterp_2(ranges, values::Vector{T}, x::Vector{<:Number}) where T + N = length(x) + out = zeros(T,N) + out .= (u->interp(ranges, values, u)).(x) + return out +end + +using LoopVectorization + +function vecinterp_3(ranges, values::Vector{T}, x::Vector{<:Number}) where T + N = length(x) + out = zeros(T,N) + @simd for n=1:N + out[n] = interp(ranges, values, x[n]) + end + return out +end + + +using LoopVectorization + +function vecinterp_4(ranges, values::Vector{T}, X::Vector{<:Number}) where T + + K = length(X) + out = zeros(T,K) + + a = (ranges[1][1]) + b = (ranges[1][2]) + N = (ranges[1][3]) + + δ = (b-a)/(N-1) + + @turbo for n=1:K + + x = X[n] + + i = div( (x-a), δ) + + i = max(min(i, N-2), 0) + + λ = (x-(a + δ*i))/δ + + i_ = floor(Int, i) + 1 + + v0 = values[i_] + v1 = values[i_+1] + # v0 = getindex(values, i_) + # v1 = getindex(values, i_+1) + + out[n] = (1-λ)*v0 + λ*v1 + end + return out +end + +function vecinterp_5(ranges, values::Vector{T}, x::Vector{<:Number}) where T + N = length(x) + out = zeros(T,N) + @turbo for n=1:N + ret = interp(ranges, values, x[n]) + out[n] = ret + end + return out +end + + + + + +# @code_warntype interp(ranges, values, 2.0) + +# xvec = [range(-0.1, 2.1; length=1000)...] +# tvec = xvec .^ 2 + +# yvec = [(interp(ranges, values, x)) for x in xvec] + + +# using ForwardDiff + +# ForwardDiff.jacobian(u->[interp(ranges, values, u[1])], [0.3]) + +# using Plots + +# plot(xvec, tvec) +# plot!(xvec, yvec) + +# using LoopVectorization: VectorizationBase + + +# a,b,N = ranges[1] +# x = VectorizationBase.Vec{4, Float64}(4, 3, 2, 5) +# δ = (b-a)/(N-1) + +# i = div( (x-a), δ) + +# i = max(min(i, N-2), 0) + +# λ = (x-(a + δ*i))/δ + +# i_ = floor(Int, i) + 1 + +# v0 = values[i_] +# v1 = values[i_+1] + +# out[n] = (1-λ)*v0 + λ*v1 + + + +# import Base: getindex +# getindex(A::Vector{T}, i::VectorizationBase.Vec{4,Int64}) where T where d = VectorizationBase.Vec{4, T}(A[i(1)], A[i(2)], A[i(3)], A[i(4)]) + +# getindex(values, i_) + + +# values[i_] + +# i_(1) \ No newline at end of file diff --git a/src/splines/macros.jl b/src/splines/macros.jl index e0955ee..2629ae3 100644 --- a/src/splines/macros.jl +++ b/src/splines/macros.jl @@ -21,31 +21,59 @@ using StaticArrays # 0.0 0.0 1.0 0.0 # ] -const Ad = [ +const Ad64 = [ [-1.0/6.0 3.0/6.0 -3.0/6.0 1.0/6.0]; [ 3.0/6.0 -6.0/6.0 0.0/6.0 4.0/6.0]; [-3.0/6.0 3.0/6.0 3.0/6.0 1.0/6.0]; [ 1.0/6.0 0.0/6.0 0.0/6.0 0.0/6.0] ] -const dAd = [ +const dAd64 = [ [ 0.0 -0.5 1.0 -0.5]; [ 0.0 1.5 -2.0 0.0]; [ 0.0 -1.5 1.0 0.5]; [ 0.0 0.5 0.0 0.0] ] -const d2Ad = [ +const d2Ad64 = [ [ 0.0 0.0 -1.0 1.0]; [ 0.0 0.0 3.0 -2.0]; [ 0.0 0.0 -3.0 1.0]; [ 0.0 0.0 1.0 0.0] ] +const Ad32 = [ + [-1.0f0/6.0f0 3.0f0/6.0f0 -3.0f0/6.0f0 1.0f0/6.0f0]; + [ 3.0f0/6.0f0 -6.0f0/6.0f0 0.0f0/6.0f0 4.0f0/6.0f0]; + [-3.0f0/6.0f0 3.0f0/6.0f0 3.0f0/6.0f0 1.0f0/6.0f0]; + [ 1.0f0/6.0f0 0.0f0/6.0f0 0.0f0/6.0f0 0.0f0/6.0f0] +] + +const dAd32 = [ + [ 0.0f0 -0.5f0 1.0f0 -0.5f0]; + [ 0.0f0 1.5f0 -2.0f0 0.0f0]; + [ 0.0f0 -1.5f0 1.0f0 0.5f0]; + [ 0.0f0 0.5f0 0.0f0 0.0f0] +] + +const d2Ad32 = [ + [ 0.0f0 0.0f0 -1.0f0 1.0f0]; + [ 0.0f0 0.0f0 3.0f0 -2.0f0]; + [ 0.0f0 0.0f0 -3.0f0 1.0f0]; + [ 0.0f0 0.0f0 1.0f0 0.0f0] +] + U(s,i) = Symbol(string(s,i)) U(s,i,j) = Symbol(string(s,i,"_",j)) -function create_Phi(d, extrap, diff) +function create_Phi(d, extrap, diff; Tf=Float64) + if Tf==Float64 + Ad = Ad64 + dAd = dAd64 + elseif Tf==Float32 + Ad = Ad32 + dAd = dAd32 + end lines = [] for i=1:d block = [] @@ -113,13 +141,13 @@ tensor_prod(["Phi_1", "Phi_2"], Int64[]) # Phi_1_1 * (Phi_2_1 * C[i1 + 1,i2 + tensor_prod(["Phi_1", "dPhi_2"], Int64[]) # Phi_1_1 * (dPhi_2_1 * C[i1 + 1,i2 + 1] + dPhi_2_2 * C[i1 + 1,i2 + 2] + dPhi_2_3 * C[i1 + 1,i2 + 3] + dPhi_2_4 * C[i1 + 1,i2 + 4]) + Phi_1_2 * (dPhi_2_1 * C[i1 + 2,i2 + 1] + dPhi_2_2 * C[i1 + 2,i2 + 2] + dPhi_2_3 * C[i1 + 2,i2 + 3] + dPhi_2_4 * C[i1 + 2,i2 + 4]) + Phi_1_3 * (dPhi_2_1 * C[i1 + 3,i2 + 1] + dPhi_2_2 * C[i1 + 3,i2 + 2] + dPhi_2_3 * C[i1 + 3,i2 + 3] + dPhi_2_4 * C[i1 + 3,i2 + 4]) + Phi_1_4 * (dPhi_2_1 * C[i1 + 4,i2 + 1] + dPhi_2_2 * C[i1 + 4,i2 + 2] + dPhi_2_3 * C[i1 + 4,i2 + 3] + dPhi_2_4 * C[i1 + 4,i2 + 4]) -function create_parameters(d) +function create_parameters(d; Tf=Float64) lines = [] for i=1:d block = quote $(U("M",i)) = orders[$i] $(U("start",i)) = a[$i] - $(U("dinv",i)) = (orders[$i]-1.0)/(b[$i]-a[$i]) + $(U("dinv",i)) = (orders[$i]-convert($Tf,1.0))/(b[$i]-a[$i]) end for ll in block.args push!(lines, ll) @@ -128,20 +156,21 @@ function create_parameters(d) return lines end -function create_local_parameters(d; vectorize=true) +function create_local_parameters(d; vectorize=true, Tf=Float64) lines = [] if vectorize for i=1:d bl = quote $(U("x",i)) = S[n][$i] $(U("u",i)) = ($(U("x",i)) - $(U("start",i)))*$(U("dinv",i)) - $(U("i",i)) = (floor(Int,$(U("u",i)) )) + # $(U("i",i)) = (floor(Int,$(U("u",i)) )) + $(U("i",i)) = unsafe_trunc(Int32, $(U("u",i))) $(U("i",i)) = max( min($(U("i",i)),$(U("M",i))-2), 0 ) $(U("t",i)) = $(U("u",i))-$(U("i",i)) $(U("tp",i,1)) = $(U("t",i))*$(U("t",i))*$(U("t",i)) $(U("tp",i,2)) = $(U("t",i))*$(U("t",i)) $(U("tp",i,3)) = $(U("t",i)) - $(U("tp",i,4)) = 1.0; + $(U("tp",i,4)) = convert($Tf,1.0); #TODO end for ll in bl.args push!(lines, ll) @@ -153,13 +182,14 @@ function create_local_parameters(d; vectorize=true) bl = quote $(U("x",i)) = S[$i] $(U("u",i)) = ($(U("x",i)) - $(U("start",i)))*$(U("dinv",i)) - $(U("i",i)) = (floor(Int,$(U("u",i)) )) + # $(U("i",i)) = (floor(Int,$(U("u",i)) )) + $(U("i",i)) = unsafe_trunc(Int32, $(U("u",i)) ) $(U("i",i)) = max( min($(U("i",i)),$(U("M",i))-2), 0 ) $(U("t",i)) = $(U("u",i))-$(U("i",i)) $(U("tp",i,1)) = $(U("t",i))*$(U("t",i))*$(U("t",i)) $(U("tp",i,2)) = $(U("t",i))*$(U("t",i)) $(U("tp",i,3)) = $(U("t",i)) - $(U("tp",i,4)) = 1.0; + $(U("tp",i,4)) = convert($Tf,1.0); end for ll in bl.args push!(lines, ll) @@ -169,17 +199,17 @@ function create_local_parameters(d; vectorize=true) return lines end -function create_function(d,extrap="natural"; vectorize=true) +function create_function(d,extrap="natural"; vectorize=true, Tf=Float32) if vectorize expr = quote - function $(Symbol(string("eval_UC_spline!")))( a, b, orders, C::Array{T,d}, S::Vector{SVector{d,Float64}}, V::Vector{T}) where d where T - $(create_parameters(d)...) + function $(Symbol(string("eval_UC_spline!")))( a, b, orders, C::Array{T,d}, S::Vector{SVector{d,$Tf}}, V::Vector{T}) where d where T + $(create_parameters(d; Tf=Tf)...) N = size(S,1) # @fastmath @inbounds @simd( # doesn't seem to make any difference for n=1:N - $(create_local_parameters(d)...) - $(create_Phi(d,extrap,false)...) - V[n] = $( tensor_prod([string("Phi_",i) for i=1:d], Int64[]) ) + $(create_local_parameters(d;Tf=Tf)...) + $(create_Phi(d,extrap,false;Tf=Tf)...) + V[n] = $( tensor_prod([string("Phi_",i) for i=1:d], Int64[]; Tf=Tf) ) end # ) end @@ -187,11 +217,11 @@ function create_function(d,extrap="natural"; vectorize=true) else expr = quote function $(Symbol(string("eval_UC_spline")))( a, b, orders, C::Array{T,d}, S::SVector{d,U}) where d where T where U - $(create_parameters(d)...) + $(create_parameters(d,Tf=Tf)...) N = size(S,1) # @fastmath @inbounds @simd( # doesn't seem to make any difference - $(create_local_parameters(d; vectorize=false)...) - $(create_Phi(d,extrap,false)...) + $(create_local_parameters(d; vectorize=false,Tf=Tf)...) + $(create_Phi(d,extrap,false; Tf=Tf)...) V = $( tensor_prod([string("Phi_",i) for i=1:d], Int64[]) ) # ) return V