From 2bddca31754bbe51362c1b8beff988fa8fd66f59 Mon Sep 17 00:00:00 2001
From: RainerHeintzmann <heintzmann@gmail.com>
Date: Tue, 23 May 2023 22:04:13 +0200
Subject: [PATCH] rem. support for low-D plans to be applied to ND

---
 lib/cufft/fft.jl        | 109 ++++++++++++++++++----------------------
 test/libraries/cufft.jl |   3 ++
 2 files changed, 51 insertions(+), 61 deletions(-)

diff --git a/lib/cufft/fft.jl b/lib/cufft/fft.jl
index 8b517c2bf7..5ccc595a6a 100644
--- a/lib/cufft/fft.jl
+++ b/lib/cufft/fft.jl
@@ -11,7 +11,6 @@ using LinearAlgebra
 Base.:(*)(p::Plan{T}, x::DenseCuArray) where {T} = p * copy1(T, x)
 Base.:(*)(p::ScaledPlan, x::DenseCuArray) = rmul!(p.p * x, p.scale)
 
-
 ## plan structure
 
 # K is an integer flag for forward/backward
@@ -34,7 +33,9 @@ function CUDA.unsafe_free!(plan::CuFFTPlan)
 end
 
 mutable struct cCuFFTPlan{T<:cufftNumber,K,inplace,N} <: CuFFTPlan{T,K,inplace}
-    handle::cufftHandle
+    # handle to Cuda low level plan. Note that this plan sometimes has lower dimensions 
+    # to handle more transform cases such as individual directions
+    handle::cufftHandle 
     ctx::CuContext
     stream::CuStream
     sz::NTuple{N,Int} # Julia size of input array
@@ -170,11 +171,10 @@ function plan_fft!(X::DenseCuArray{T,N}, region) where {T<:cufftComplexes,N}
     region = Tuple(region)
 
     md = plan_max_dims(region, size(X))
-    # X = front_view(X, md)
     sizex = size(X)[1:md]
     handle = cufftGetPlan(xtype, sizex, region)
 
-    cCuFFTPlan{T,K,inplace,md}(handle, sizex, sizex, region, xtype)
+    cCuFFTPlan{T,K,inplace,N}(handle, X, size(X), region, xtype)
 end
 
 
@@ -185,12 +185,10 @@ function plan_bfft!(X::DenseCuArray{T,N}, region) where {T<:cufftComplexes,N}
     region = Tuple(region)
 
     md = plan_max_dims(region, size(X))
-    # X = front_view(X, md)
     sizex = size(X)[1:md]
-
     handle = cufftGetPlan(xtype, sizex, region)
 
-    cCuFFTPlan{T,K,inplace,md}(handle, sizex, sizex, region, xtype)
+    cCuFFTPlan{T,K,inplace,N}(handle, X, size(X), region, xtype)
 end
 
 # out-of-place complex
@@ -201,11 +199,10 @@ function plan_fft(X::DenseCuArray{T,N}, region) where {T<:cufftComplexes,N}
     region = Tuple(region)
 
     md = plan_max_dims(region,size(X))
-    # X = front_view(X, md)
     sizex = size(X)[1:md]
     handle = cufftGetPlan(xtype, sizex, region)
 
-    cCuFFTPlan{T,K,inplace,md}(handle,sizex, sizex, region, xtype)
+    cCuFFTPlan{T,K,inplace,N}(handle, X, size(X), region, xtype)
 end
 
 function plan_bfft(X::DenseCuArray{T,N}, region) where {T<:cufftComplexes,N}
@@ -215,11 +212,10 @@ function plan_bfft(X::DenseCuArray{T,N}, region) where {T<:cufftComplexes,N}
     region = Tuple(region)
 
     md = plan_max_dims(region,size(X))
-    # X = front_view(X, md)
     sizex = size(X)[1:md]
     handle = cufftGetPlan(xtype, sizex, region)
 
-    cCuFFTPlan{T,K,inplace,md}(handle, sizex, sizex, region, xtype)
+    cCuFFTPlan{T,K,inplace,N}(handle, size(X), size(X), region, xtype)
 end
 
 # out-of-place real-to-complex
@@ -235,10 +231,10 @@ function plan_rfft(X::DenseCuArray{T,N}, region) where {T<:cufftReals,N}
 
     handle = cufftGetPlan(xtype, sizex, region)
 
-    ydims = collect(sizex)
+    ydims = collect(size(X))
     ydims[region[1]] = div(ydims[region[1]],2)+1
 
-    rCuFFTPlan{T,K,inplace,md}(handle, sizex, (ydims...,), region, xtype)
+    rCuFFTPlan{T,K,inplace,N}(handle, size(X), (ydims...,), region, xtype)
 end
 
 function plan_brfft(X::DenseCuArray{T,N}, d::Integer, region::Any) where {T<:cufftComplexes,N}
@@ -247,16 +243,12 @@ function plan_brfft(X::DenseCuArray{T,N}, d::Integer, region::Any) where {T<:cuf
     xtype =  (T == cufftComplex) ? CUFFT_C2R : CUFFT_Z2D
     region = Tuple(region)
 
-    md = plan_max_dims(region,size(X))
-    sizex = size(X)[1:md]
-    # X = front_view(X, md)
-
-    ydims = collect(sizex)
+    ydims = collect(size(X))
     ydims[region[1]] = d
 
     handle = cufftGetPlan(xtype, (ydims...,), region)
 
-    rCuFFTPlan{T,K,inplace,md}(handle, sizex, (ydims...,), region, xtype)
+    rCuFFTPlan{T,K,inplace,N}(handle, size(X), (ydims...,), region, xtype)
 end
 
 
@@ -267,44 +259,39 @@ function plan_inv(p::cCuFFTPlan{T,CUFFT_FORWARD,inplace,N}) where {T,N,inplace}
     md = plan_max_dims(p.region, p.sz)
     sizex = p.sz[1:md]
     handle = cufftGetPlan(p.xtype, sizex, p.region)
-    ScaledPlan(cCuFFTPlan{T,CUFFT_INVERSE,inplace,md}(handle, sizex, sizex, p.region,
+    ScaledPlan(cCuFFTPlan{T,CUFFT_INVERSE,inplace,N}(handle,  p.sz,  p.sz, p.region,
                                                      p.xtype),
-                        normalization(real(T), sizex, p.region))
+                        normalization(real(T),  p.sz, p.region))
 end
 
 function plan_inv(p::cCuFFTPlan{T,CUFFT_INVERSE,inplace,N}) where {T,N,inplace}
     md = plan_max_dims(p.region,p.sz)
     sizex = p.sz[1:md]
     handle = cufftGetPlan(p.xtype, sizex, p.region)
-    ScaledPlan(cCuFFTPlan{T,CUFFT_FORWARD,inplace,md}(handle, sizex, sizex, p.region,
+    ScaledPlan(cCuFFTPlan{T,CUFFT_FORWARD,inplace,N}(handle,  p.sz,  p.sz, p.region,
                                                      p.xtype),
-                        normalization(real(T), sizex, p.region))
+                        normalization(real(T),  p.sz, p.region))
 end
 
 function plan_inv(p::rCuFFTPlan{T,CUFFT_INVERSE,inplace,N}
                   ) where {T<:cufftComplexes,N,inplace}
     md_osz = plan_max_dims(p.region, p.osz)
     sz_X = p.osz[1:md_osz]
-    md_sz = plan_max_dims(p.region, p.sz)
-    sz_Y = p.sz[1:md_sz]
-    # Y = CuArray{T}(undef, sz_Y)
     xtype = p.xtype == CUFFT_C2R ? CUFFT_R2C : CUFFT_D2Z
     handle = cufftGetPlan(xtype, sz_X, p.region)
-    ScaledPlan(rCuFFTPlan{real(T),CUFFT_FORWARD,inplace,md_sz}(handle, sz_X, sz_Y, p.region, xtype),
-                        normalization(real(T), sz_X, p.region))
+    ScaledPlan(rCuFFTPlan{real(T),CUFFT_FORWARD,inplace,N}(handle, p.osz, p.sz, p.region, xtype),
+                        normalization(real(T), p.osz, p.region))
 end
 
 function plan_inv(p::rCuFFTPlan{T,CUFFT_FORWARD,inplace,N}
                   ) where {T<:cufftReals,N,inplace}
-    md_osz = plan_max_dims(p.region,p.osz)
-    sz_X = p.osz[1:md_osz]
     md_sz = plan_max_dims(p.region,p.sz)
     sz_Y = p.sz[1:md_sz]
     xtype = p.xtype == CUFFT_R2C ? CUFFT_C2R : CUFFT_Z2D
     handle = cufftGetPlan(xtype, sz_Y, p.region)
-    ScaledPlan(rCuFFTPlan{complex(T),CUFFT_INVERSE,inplace,md_sz}(handle, sz_X, sz_Y,
+    ScaledPlan(rCuFFTPlan{complex(T),CUFFT_INVERSE,inplace,N}(handle, p.osz, p.sz,
                                                               p.region, xtype),
-                        normalization(real(T), sz_Y, p.region))
+                        normalization(real(T), p.sz, p.region))
 end
 
 
@@ -315,14 +302,14 @@ end
 #       see # JuliaGPU/CuArrays.jl#345, NVIDIA/cuFFT#2714055.
 
 function assert_applicable(p::CuFFTPlan{T}, X::DenseCuArray{T}) where {T}
-    (size(X) >= p.sz) ||
+    (size(X) == p.sz) ||
         throw(ArgumentError("CuFFT plan applied to wrong-size input"))
 end
 
 function assert_applicable(p::CuFFTPlan{T,K,inplace}, X::DenseCuArray{T},
                            Y::DenseCuArray) where {T,K,inplace}
     assert_applicable(p, X)
-    if size(Y)[1:length(p.osz)] != p.osz
+    if size(Y) != p.osz
         throw(ArgumentError("CuFFT plan applied to wrong-size output"))
     elseif inplace != (pointer(X) == pointer(Y))
         throw(ArgumentError(string("CuFFT ",
@@ -333,24 +320,24 @@ function assert_applicable(p::CuFFTPlan{T,K,inplace}, X::DenseCuArray{T},
     end
 end
 
-function unsafe_execute!(plan::cCuFFTPlan{cufftComplex,K,<:Any,N},
+function unsafe_execute!(plan::cCuFFTPlan{cufftComplex,K,<:Any,M},
                          x::DenseCuArray{cufftComplex,N},
-                         y::DenseCuArray{cufftComplex,N}) where {K,N}
+                         y::DenseCuArray{cufftComplex,N}) where {K,M,N}
     @assert plan.xtype == CUFFT_C2C
     update_stream(plan)
     cufftExecC2C(plan, x, y, K)
 end
 
-function unsafe_execute!(plan::rCuFFTPlan{cufftComplex,K,true,N},
+function unsafe_execute!(plan::rCuFFTPlan{cufftComplex,K,true,M},
                          x::DenseCuArray{cufftComplex,N},
-                         y::DenseCuArray{cufftReal,N}) where {K,N}
+                         y::DenseCuArray{cufftReal,N}) where {K,M,N}
     @assert plan.xtype == CUFFT_C2R
     update_stream(plan)
     cufftExecC2R(plan, x, y)
 end
-function unsafe_execute!(plan::rCuFFTPlan{cufftComplex,K,false,N},
+function unsafe_execute!(plan::rCuFFTPlan{cufftComplex,K,false,M},
                          x::DenseCuArray{cufftComplex,N},
-                         y::DenseCuArray{cufftReal}) where {K,N}
+                         y::DenseCuArray{cufftReal}) where {K,M,N}
     @assert plan.xtype == CUFFT_C2R
     x = copy(x)
     update_stream(plan)
@@ -358,32 +345,32 @@ function unsafe_execute!(plan::rCuFFTPlan{cufftComplex,K,false,N},
     unsafe_free!(x)
 end
 
-function unsafe_execute!(plan::rCuFFTPlan{cufftReal,K,<:Any,N},
+function unsafe_execute!(plan::rCuFFTPlan{cufftReal,K,<:Any,M},
                          x::DenseCuArray{cufftReal,N},
-                         y::DenseCuArray{cufftComplex,N}) where {K,N}
+                         y::DenseCuArray{cufftComplex,N}) where {K,M,N}
     @assert plan.xtype == CUFFT_R2C
     update_stream(plan)
     cufftExecR2C(plan, x, y)
 end
 
-function unsafe_execute!(plan::cCuFFTPlan{cufftDoubleComplex,K,<:Any,N},
+function unsafe_execute!(plan::cCuFFTPlan{cufftDoubleComplex,K,<:Any,M},
                          x::DenseCuArray{cufftDoubleComplex,N},
-                         y::DenseCuArray{cufftDoubleComplex}) where {K,N}
+                         y::DenseCuArray{cufftDoubleComplex}) where {K,M,N}
     @assert plan.xtype == CUFFT_Z2Z
     update_stream(plan)
     cufftExecZ2Z(plan, x, y, K)
 end
 
-function unsafe_execute!(plan::rCuFFTPlan{cufftDoubleComplex,K,true,N},
+function unsafe_execute!(plan::rCuFFTPlan{cufftDoubleComplex,K,true,M},
                          x::DenseCuArray{cufftDoubleComplex,N},
-                         y::DenseCuArray{cufftDoubleReal}) where {K,N}
+                         y::DenseCuArray{cufftDoubleReal}) where {K,M,N}
     update_stream(plan)
     @assert plan.xtype == CUFFT_Z2D
     cufftExecZ2D(plan, x, y)
 end
-function unsafe_execute!(plan::rCuFFTPlan{cufftDoubleComplex,K,false,N},
+function unsafe_execute!(plan::rCuFFTPlan{cufftDoubleComplex,K,false,M},
                          x::DenseCuArray{cufftDoubleComplex,N},
-                         y::DenseCuArray{cufftDoubleReal}) where {K,N}
+                         y::DenseCuArray{cufftDoubleReal}) where {K,M,N}
     @assert plan.xtype == CUFFT_Z2D
     x = copy(x)
     update_stream(plan)
@@ -391,26 +378,29 @@ function unsafe_execute!(plan::rCuFFTPlan{cufftDoubleComplex,K,false,N},
     unsafe_free!(x)
 end
 
-function unsafe_execute!(plan::rCuFFTPlan{cufftDoubleReal,K,<:Any,N},
+function unsafe_execute!(plan::rCuFFTPlan{cufftDoubleReal,K,<:Any,M},
                          x::DenseCuArray{cufftDoubleReal,N},
-                         y::DenseCuArray{cufftDoubleComplex,N}) where {K,N}
+                         y::DenseCuArray{cufftDoubleComplex,N}) where {K,M,N}
     @assert plan.xtype == CUFFT_D2Z
     update_stream(plan)
     cufftExecD2Z(plan, x, y)
 end
 
 # a version of unsafe_execute which applies the plan to each element of trailing dimensions not covered by the plan.
-function unsafe_execute_trailing!(p,x, y)
-    N = ndims(p)
+# Note that for plans, with trailing non-transform dimensions views are created for each of such elements.
+# Such views each have lower dimensions and are then transformed by the lower dimension low-level Cuda plan.
+function unsafe_execute_trailing!(p, x, y)
+    N = plan_max_dims(p.region, p.osz)
     M = ndims(x)
     d = p.region[end]
-    if M == N  # p.region[1] == 1 || ndims(x) == d
+    if M == N  
         unsafe_execute!(p,x,y)
     else
         front_ids = ntuple((dd)->Colon(), d)
         for c in CartesianIndices(size(x)[d+1:end])
-            vx = @view x[front_ids..., Tuple(c)...]
-            vy = @view y[front_ids..., Tuple(c)...]
+            ids = ntuple((dd)->c[dd], M-N)
+            vx = @view x[front_ids..., ids...]
+            vy = @view y[front_ids..., ids...]
             unsafe_execute!(p,vx,vy)
         end
     end
@@ -435,8 +425,7 @@ function Base.:(*)(p::rCuFFTPlan{T,CUFFT_FORWARD,false,N}, x::DenseCuArray{T,M}
            ) where {T<:cufftReals,N,M}
     assert_applicable(p,x)
     @assert p.xtype ∈ [CUFFT_R2C,CUFFT_D2Z]
-    osz = get_osz(p.osz, x)
-    y = CuArray{complex(T),M}(undef, osz)
+    y = CuArray{complex(T),M}(undef, p.osz)
     unsafe_execute_trailing!(p,x, y)
     y
 end
@@ -445,8 +434,7 @@ function Base.:(*)(p::rCuFFTPlan{T,CUFFT_INVERSE,false,N}, x::DenseCuArray{T,M}
            ) where {T<:cufftComplexes,N,M}
     assert_applicable(p,x)
     @assert p.xtype ∈ [CUFFT_C2R,CUFFT_Z2D]
-    osz = get_osz(p.osz, x)
-    y = CuArray{real(T),M}(undef, osz)
+    y = CuArray{real(T),M}(undef, p.osz)
     unsafe_execute_trailing!(p,x, y)
     y
 end
@@ -459,8 +447,7 @@ end
 
 function Base.:(*)(p::cCuFFTPlan{T,K,false,N}, x::DenseCuArray{T,M}) where {T,K,N,M}
     assert_applicable(p,x)
-    osz = get_osz(p.osz, x)
-    y = CuArray{T,M}(undef, osz)
+    y = CuArray{T,M}(undef, p.osz)
     unsafe_execute_trailing!(p,x, y)
     y
 end
diff --git a/test/libraries/cufft.jl b/test/libraries/cufft.jl
index ab14313981..6c3cbfa200 100644
--- a/test/libraries/cufft.jl
+++ b/test/libraries/cufft.jl
@@ -61,6 +61,9 @@ function batched(X::AbstractArray{T,N},region) where {T <: Complex,N}
     d_X = CuArray(X)
     p = plan_fft(d_X,region)
     d_Y = p * d_X
+    d_X2 = reshape(d_X, (size(d_X)..., 1))
+    @test_throws ArgumentError p * d_X2
+
     Y = collect(d_Y)
     @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL)