diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index a43b659021..3221c902c8 100755 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -609,6 +609,16 @@ steps: - "julia --color=yes --check-bounds=yes --project=.buildkite test/Operators/hybrid/unit_2d.jl" - "julia --color=yes --check-bounds=yes --project=.buildkite test/Operators/hybrid/convergence_2d.jl" + - label: "Unit: hyb ops 2d CUDA" + key: unit_hyb_ops_2d_cuda + command: + - "julia --color=yes --check-bounds=yes --project=.buildkite test/Operators/hybrid/unit_2d.jl" + - "julia --color=yes --check-bounds=yes --project=.buildkite test/Operators/hybrid/convergence_2d.jl" + env: + CLIMACOMMS_DEVICE: "CUDA" + agents: + slurm_gpus: 1 + - label: "Unit: hyb ops 3d" key: unit_hyb_ops_3d command: diff --git a/ext/cuda/data_layouts_mapreduce.jl b/ext/cuda/data_layouts_mapreduce.jl index 551b8d451d..74783b6d78 100644 --- a/ext/cuda/data_layouts_mapreduce.jl +++ b/ext/cuda/data_layouts_mapreduce.jl @@ -24,13 +24,7 @@ end function mapreduce_cuda( f, op, - data::Union{ - DataLayouts.VF, - DataLayouts.IJFH, - DataLayouts.IJHF, - DataLayouts.VIJFH, - DataLayouts.VIJHF, - }; + data::DataLayouts.AbstractData; weighted_jacobian = OnesArray(parent(data)), opargs..., ) @@ -132,9 +126,9 @@ function mapreduce_cuda_kernel!( gidx = _get_gidx(tidx, bidx, effective_blksize) reduction = CUDA.CuStaticSharedArray(T, shmemsize) reduction[tidx] = 0 - (Nij, _, _, Nv, Nh) = DataLayouts.universal_size(us) + (Ni, Nj, _, Nv, Nh) = DataLayouts.universal_size(us) Nf = 1 # a view into `fidx` always gives a size of Nf = 1 - nitems = Nv * Nij * Nij * Nf * Nh + nitems = Nv * Ni * Nj * Nf * Nh # load shmem if gidx ≤ nitems diff --git a/ext/cuda/data_layouts_threadblock.jl b/ext/cuda/data_layouts_threadblock.jl index 91cd0191fb..43f64319af 100644 --- a/ext/cuda/data_layouts_threadblock.jl +++ b/ext/cuda/data_layouts_threadblock.jl @@ -213,15 +213,15 @@ end us::DataLayouts.UniversalSize, n_max_threads::Integer, ) - (Nij, _, _, _, Nh) = DataLayouts.universal_size(us) + (Ni, Nj, _, _, Nh) = DataLayouts.universal_size(us) Nh_thread = min( - Int(fld(n_max_threads, Nij * Nij)), + Int(fld(n_max_threads, Ni * Nj)), maximum_allowable_threads()[3], Nh, ) Nh_blocks = cld(Nh, Nh_thread) - @assert prod((Nij, Nij, Nh_thread)) ≤ n_max_threads "threads,n_max_threads=($(prod((Nij, Nij, Nh_thread))),$n_max_threads)" - return (; threads = (Nij, Nij, Nh_thread), blocks = (Nh_blocks,)) + @assert prod((Ni, Nj, Nh_thread)) ≤ n_max_threads "threads,n_max_threads=($(prod((Ni, Nj, Nh_thread))),$n_max_threads)" + return (; threads = (Ni, Nj, Nh_thread), blocks = (Nh_blocks,)) end @inline function columnwise_universal_index(us::UniversalSize) (i, j, th) = CUDA.threadIdx() @@ -241,9 +241,9 @@ end n_max_threads::Integer; Nnames, ) - (Nij, _, _, _, Nh) = DataLayouts.universal_size(us) - @assert prod((Nij, Nij, Nnames)) ≤ n_max_threads "threads,n_max_threads=($(prod((Nij, Nij, Nnames))),$n_max_threads)" - return (; threads = (Nij, Nij, Nnames), blocks = (Nh,)) + (Ni, Nj, _, _, Nh) = DataLayouts.universal_size(us) + @assert prod((Ni, Nj, Nnames)) ≤ n_max_threads "threads,n_max_threads=($(prod((Ni, Nj, Nnames))),$n_max_threads)" + return (; threads = (Ni, Nj, Nnames), blocks = (Nh,)) end @inline function multiple_field_solve_universal_index(us::UniversalSize) (i, j, iname) = CUDA.threadIdx() @@ -258,12 +258,12 @@ end us::DataLayouts.UniversalSize, n_max_threads::Integer = 256; ) - (Nq, _, _, Nv, Nh) = DataLayouts.universal_size(us) - Nvthreads = min(fld(n_max_threads, Nq * Nq), maximum_allowable_threads()[3]) + (Ni, Nj, _, Nv, Nh) = DataLayouts.universal_size(us) + Nvthreads = min(fld(n_max_threads, Ni * Nj), maximum_allowable_threads()[3]) Nvblocks = cld(Nv, Nvthreads) - @assert prod((Nq, Nq, Nvthreads)) ≤ n_max_threads "threads,n_max_threads=($(prod((Nq, Nq, Nvthreads))),$n_max_threads)" - @assert Nq * Nq ≤ n_max_threads - return (; threads = (Nq, Nq, Nvthreads), blocks = (Nh, Nvblocks), Nvthreads) + @assert prod((Ni, Nj, Nvthreads)) ≤ n_max_threads "threads,n_max_threads=($(prod((Ni, Nj, Nvthreads))),$n_max_threads)" + @assert Ni * Nj ≤ n_max_threads + return (; threads = (Ni, Nj, Nvthreads), blocks = (Nh, Nvblocks), Nvthreads) end @inline function spectral_universal_index(space::Spaces.AbstractSpace) i = threadIdx().x diff --git a/ext/cuda/operators_sem_shmem.jl b/ext/cuda/operators_sem_shmem.jl index d7af370671..f5132b9dd7 100644 --- a/ext/cuda/operators_sem_shmem.jl +++ b/ext/cuda/operators_sem_shmem.jl @@ -4,6 +4,20 @@ import ClimaCore.Operators: Divergence, WeakDivergence, Gradient, WeakGradient, Curl, WeakCurl import ClimaCore.Operators: operator_return_eltype, get_local_geometry +Base.@propagate_inbounds function operator_shmem( + space, + ::Val{Nvt}, + op::Operators.Divergence{(1,)}, + arg, +) where {Nvt} + FT = Spaces.undertype(space) + QS = Spaces.quadrature_style(space) + Nq = Quadratures.degrees_of_freedom(QS) + # allocate temp output + RT = Operators.operator_return_eltype(op, eltype(arg)) + Jv¹ = CUDA.CuStaticSharedArray(RT, (Nq, Nvt)) + return (Jv¹,) +end Base.@propagate_inbounds function operator_shmem( space, ::Val{Nvt}, @@ -20,6 +34,23 @@ Base.@propagate_inbounds function operator_shmem( return (Jv¹, Jv²) end +Base.@propagate_inbounds function operator_fill_shmem!( + op::Divergence{(1,)}, + (Jv¹,), + space, + ij, + slabidx, + arg, +) + vt = threadIdx().z + local_geometry = get_local_geometry(space, ij, slabidx) + i, _ = ij.I + Jv¹[i, vt] = + local_geometry.J ⊠ RecursiveApply.rmap( + v -> Geometry.contravariant1(v, local_geometry), + arg, + ) +end Base.@propagate_inbounds function operator_fill_shmem!( op::Divergence{(1, 2)}, (Jv¹, Jv²), @@ -44,6 +75,21 @@ Base.@propagate_inbounds function operator_fill_shmem!( ) end +Base.@propagate_inbounds function operator_shmem( + space, + ::Val{Nvt}, + op::WeakDivergence{(1,)}, + arg, +) where {Nvt} + FT = Spaces.undertype(space) + QS = Spaces.quadrature_style(space) + Nq = Quadratures.degrees_of_freedom(QS) + # allocate temp output + RT = operator_return_eltype(op, eltype(arg)) + Nf = DataLayouts.typesize(FT, RT) + WJv¹ = CUDA.CuStaticSharedArray(RT, (Nq, Nvt)) + return (WJv¹,) +end Base.@propagate_inbounds function operator_shmem( space, ::Val{Nvt}, @@ -61,6 +107,23 @@ Base.@propagate_inbounds function operator_shmem( return (WJv¹, WJv²) end +Base.@propagate_inbounds function operator_fill_shmem!( + op::WeakDivergence{(1,)}, + (WJv¹,), + space, + ij, + slabidx, + arg, +) + vt = threadIdx().z + local_geometry = get_local_geometry(space, ij, slabidx) + i, _ = ij.I + WJv¹[i, vt] = + local_geometry.WJ ⊠ RecursiveApply.rmap( + v -> Geometry.contravariant1(v, local_geometry), + arg, + ) +end Base.@propagate_inbounds function operator_fill_shmem!( op::WeakDivergence{(1, 2)}, (WJv¹, WJv²), @@ -85,46 +148,39 @@ Base.@propagate_inbounds function operator_fill_shmem!( ) end +Base.@propagate_inbounds function operator_shmem( + space, + ::Val{Nvt}, + op::Gradient{(1,)}, + arg, +) where {Nvt} + QS = Spaces.quadrature_style(space) + Nq = Quadratures.degrees_of_freedom(QS) + return CUDA.CuStaticSharedArray(eltype(arg), (Nq, Nvt)) +end Base.@propagate_inbounds function operator_shmem( space, ::Val{Nvt}, op::Gradient{(1, 2)}, arg, ) where {Nvt} - FT = Spaces.undertype(space) QS = Spaces.quadrature_style(space) Nq = Quadratures.degrees_of_freedom(QS) - IT = eltype(arg) - ET = eltype(IT) - RT = operator_return_eltype(op, IT) - RT12 = Geometry.AxisTensor{ - ET, - 2, - Tuple{Geometry.CovariantAxis{(1, 2)}, Geometry.LocalAxis{(1, 2)}}, - SMatrix{2, 2, ET, 4}, - } - RT123 = Geometry.AxisTensor{ - ET, - 2, - Tuple{Geometry.CovariantAxis{(1, 2)}, Geometry.LocalAxis{(1, 2, 3)}}, - SMatrix{2, 3, ET, 6}, - } - if RT <: Geometry.Covariant12Vector - # allocate temp output - input = CUDA.CuStaticSharedArray(ET, (Nq, Nq, Nvt)) - return (input,) - elseif RT <: RT12 - v₁ = CUDA.CuStaticSharedArray(ET, (Nq, Nq, Nvt)) - v₂ = CUDA.CuStaticSharedArray(ET, (Nq, Nq, Nvt)) - return (v₁, v₂) - elseif RT <: RT123 - v₁ = CUDA.CuStaticSharedArray(ET, (Nq, Nq, Nvt)) - v₂ = CUDA.CuStaticSharedArray(ET, (Nq, Nq, Nvt)) - v₃ = CUDA.CuStaticSharedArray(ET, (Nq, Nq, Nvt)) - return (v₁, v₂, v₃) - end + return CUDA.CuStaticSharedArray(eltype(arg), (Nq, Nq, Nvt)) end +Base.@propagate_inbounds function operator_fill_shmem!( + op::Gradient{(1,)}, + input, + space, + ij, + slabidx, + arg, +) + vt = threadIdx().z + i, _ = ij.I + input[i, vt] = arg +end Base.@propagate_inbounds function operator_fill_shmem!( op::Gradient{(1, 2)}, input, @@ -135,37 +191,23 @@ Base.@propagate_inbounds function operator_fill_shmem!( ) vt = threadIdx().z i, j = ij.I - local_geometry = get_local_geometry(space, ij, slabidx) - RT = operator_return_eltype(op, typeof(arg)) - ET = eltype(eltype(arg)) - RT12 = Geometry.AxisTensor{ - ET, - 2, - Tuple{Geometry.CovariantAxis{(1, 2)}, Geometry.LocalAxis{(1, 2)}}, - SMatrix{2, 2, ET, 4}, - } - RT123 = Geometry.AxisTensor{ - ET, - 2, - Tuple{Geometry.CovariantAxis{(1, 2)}, Geometry.LocalAxis{(1, 2, 3)}}, - SMatrix{2, 3, ET, 6}, - } - if RT <: Geometry.Covariant12Vector - (v,) = input - v[i, j, vt] = arg - elseif RT <: RT12 - v₁, v₂ = input - v₁[i, j, vt] = Geometry.LocalVector(arg, local_geometry).u - v₂[i, j, vt] = Geometry.LocalVector(arg, local_geometry).v - elseif RT <: RT123 - v₁, v₂, v₃ = input - v₁[i, j, vt] = Geometry.LocalVector(arg, local_geometry).u - v₂[i, j, vt] = Geometry.LocalVector(arg, local_geometry).v - v₃[i, j, vt] = Geometry.LocalVector(arg, local_geometry).w - end + input[i, j, vt] = arg end - +Base.@propagate_inbounds function operator_shmem( + space, + ::Val{Nvt}, + op::WeakGradient{(1,)}, + arg, +) where {Nvt} + FT = Spaces.undertype(space) + QS = Spaces.quadrature_style(space) + Nq = Quadratures.degrees_of_freedom(QS) + # allocate temp output + IT = eltype(arg) + Wf = CUDA.CuStaticSharedArray(IT, (Nq, Nvt)) + return (Wf,) +end Base.@propagate_inbounds function operator_shmem( space, ::Val{Nvt}, @@ -181,6 +223,20 @@ Base.@propagate_inbounds function operator_shmem( return (Wf,) end +Base.@propagate_inbounds function operator_fill_shmem!( + op::WeakGradient{(1,)}, + (Wf,), + space, + ij, + slabidx, + arg, +) + vt = threadIdx().z + local_geometry = get_local_geometry(space, ij, slabidx) + W = local_geometry.WJ * local_geometry.invJ + i, _ = ij.I + Wf[i, vt] = W ⊠ arg +end Base.@propagate_inbounds function operator_fill_shmem!( op::WeakGradient{(1, 2)}, (Wf,), @@ -196,7 +252,33 @@ Base.@propagate_inbounds function operator_fill_shmem!( Wf[i, j, vt] = W ⊠ arg end - +Base.@propagate_inbounds function operator_shmem( + space, + ::Val{Nvt}, + op::Curl{(1,)}, + arg, +) where {Nvt} + FT = Spaces.undertype(space) + QS = Spaces.quadrature_style(space) + Nq = Quadratures.degrees_of_freedom(QS) + IT = eltype(arg) + ET = eltype(IT) + RT = operator_return_eltype(op, IT) + # allocate temp output + if RT <: Geometry.Contravariant3Vector # input is a Covariant12Vector + v₂ = CUDA.CuStaticSharedArray(ET, (Nq, Nvt)) + return (nothing, v₂) + elseif RT <: Geometry.Contravariant2Vector # input is a Covariant3Vector + v₃ = CUDA.CuStaticSharedArray(ET, (Nq, Nvt)) + return (v₃,) + elseif RT <: Geometry.Contravariant23Vector # input is a Covariant123Vector + v₂ = CUDA.CuStaticSharedArray(ET, (Nq, Nvt)) + v₃ = CUDA.CuStaticSharedArray(ET, (Nq, Nvt)) + return (nothing, v₂, v₃) + else + error("invalid return type") + end +end Base.@propagate_inbounds function operator_shmem( space, ::Val{Nvt}, @@ -210,15 +292,14 @@ Base.@propagate_inbounds function operator_shmem( ET = eltype(IT) RT = operator_return_eltype(op, IT) # allocate temp output - if RT <: Geometry.Contravariant3Vector - # input data is a Covariant12Vector field + if RT <: Geometry.Contravariant3Vector # input is a Covariant12Vector v₁ = CUDA.CuStaticSharedArray(ET, (Nq, Nq, Nvt)) v₂ = CUDA.CuStaticSharedArray(ET, (Nq, Nq, Nvt)) return (v₁, v₂) - elseif RT <: Geometry.Contravariant12Vector + elseif RT <: Geometry.Contravariant12Vector # input is a Covariant3Vector v₃ = CUDA.CuStaticSharedArray(ET, (Nq, Nq, Nvt)) return (v₃,) - elseif RT <: Geometry.Contravariant123Vector + elseif RT <: Geometry.Contravariant123Vector # input is a Covariant123Vector v₁ = CUDA.CuStaticSharedArray(ET, (Nq, Nq, Nvt)) v₂ = CUDA.CuStaticSharedArray(ET, (Nq, Nq, Nvt)) v₃ = CUDA.CuStaticSharedArray(ET, (Nq, Nq, Nvt)) @@ -228,6 +309,30 @@ Base.@propagate_inbounds function operator_shmem( end end +Base.@propagate_inbounds function operator_fill_shmem!( + op::Curl{(1,)}, + work, + space, + ij, + slabidx, + arg, +) + vt = threadIdx().z + i, _ = ij.I + local_geometry = get_local_geometry(space, ij, slabidx) + RT = operator_return_eltype(op, typeof(arg)) + if RT <: Geometry.Contravariant3Vector + _, v₂ = work + v₂[i, vt] = Geometry.covariant2(arg, local_geometry) + elseif RT <: Geometry.Contravariant2Vector + (v₃,) = work + v₃[i, vt] = Geometry.covariant3(arg, local_geometry) + else + _, v₂, v₃ = work + v₂[i, vt] = Geometry.covariant2(arg, local_geometry) + v₃[i, vt] = Geometry.covariant3(arg, local_geometry) + end +end Base.@propagate_inbounds function operator_fill_shmem!( op::Curl{(1, 2)}, work, @@ -255,8 +360,33 @@ Base.@propagate_inbounds function operator_fill_shmem!( end end - - +Base.@propagate_inbounds function operator_shmem( + space, + ::Val{Nvt}, + op::WeakCurl{(1,)}, + arg, +) where {Nvt} + FT = Spaces.undertype(space) + QS = Spaces.quadrature_style(space) + Nq = Quadratures.degrees_of_freedom(QS) + IT = eltype(arg) + ET = eltype(IT) + RT = operator_return_eltype(op, IT) + # allocate temp output + if RT <: Geometry.Contravariant3Vector # input is a Covariant12Vector + Wv₂ = CUDA.CuStaticSharedArray(ET, (Nq, Nvt)) + return (nothing, Wv₂) + elseif RT <: Geometry.Contravariant2Vector # input is a Covariant3Vector + Wv₃ = CUDA.CuStaticSharedArray(ET, (Nq, Nvt)) + return (Wv₃,) + elseif RT <: Geometry.Contravariant23Vector # input is a Covariant123Vector + Wv₂ = CUDA.CuStaticSharedArray(ET, (Nq, Nvt)) + Wv₃ = CUDA.CuStaticSharedArray(ET, (Nq, Nvt)) + return (nothing, Wv₂, Wv₃) + else + error("invalid return type") + end +end Base.@propagate_inbounds function operator_shmem( space, ::Val{Nvt}, @@ -270,15 +400,14 @@ Base.@propagate_inbounds function operator_shmem( ET = eltype(IT) RT = operator_return_eltype(op, IT) # allocate temp output - if RT <: Geometry.Contravariant3Vector - # input data is a Covariant12Vector field + if RT <: Geometry.Contravariant3Vector # input is a Covariant12Vector Wv₁ = CUDA.CuStaticSharedArray(ET, (Nq, Nq, Nvt)) Wv₂ = CUDA.CuStaticSharedArray(ET, (Nq, Nq, Nvt)) return (Wv₁, Wv₂) - elseif RT <: Geometry.Contravariant12Vector + elseif RT <: Geometry.Contravariant12Vector # input is a Covariant3Vector Wv₃ = CUDA.CuStaticSharedArray(ET, (Nq, Nq, Nvt)) return (Wv₃,) - elseif RT <: Geometry.Contravariant123Vector + elseif RT <: Geometry.Contravariant123Vector # input is a Covariant123Vector Wv₁ = CUDA.CuStaticSharedArray(ET, (Nq, Nq, Nvt)) Wv₂ = CUDA.CuStaticSharedArray(ET, (Nq, Nq, Nvt)) Wv₃ = CUDA.CuStaticSharedArray(ET, (Nq, Nq, Nvt)) @@ -288,6 +417,31 @@ Base.@propagate_inbounds function operator_shmem( end end +Base.@propagate_inbounds function operator_fill_shmem!( + op::WeakCurl{(1,)}, + work, + space, + ij, + slabidx, + arg, +) + vt = threadIdx().z + i, _ = ij.I + local_geometry = get_local_geometry(space, ij, slabidx) + W = local_geometry.WJ * local_geometry.invJ + RT = operator_return_eltype(op, typeof(arg)) + if RT <: Geometry.Contravariant3Vector + _, Wv₂ = work + Wv₂[i, vt] = W ⊠ Geometry.covariant2(arg, local_geometry) + elseif RT <: Geometry.Contravariant2Vector + (Wv₃,) = work + Wv₃[i, vt] = W ⊠ Geometry.covariant3(arg, local_geometry) + else + _, Wv₂, Wv₃ = work + Wv₂[i, vt] = W ⊠ Geometry.covariant2(arg, local_geometry) + Wv₃[i, vt] = W ⊠ Geometry.covariant3(arg, local_geometry) + end +end Base.@propagate_inbounds function operator_fill_shmem!( op::WeakCurl{(1, 2)}, work, diff --git a/ext/cuda/operators_spectral_element.jl b/ext/cuda/operators_spectral_element.jl index d22ce173a7..141e17f82a 100644 --- a/ext/cuda/operators_spectral_element.jl +++ b/ext/cuda/operators_spectral_element.jl @@ -177,6 +177,29 @@ Base.@propagate_inbounds function resolve_shmem!(obj, ij, slabidx) nothing end +Base.@propagate_inbounds function operator_evaluate( + op::Divergence{(1,)}, + (Jv¹,), + space, + ij, + slabidx, +) + vt = threadIdx().z + i, _ = ij.I + + FT = Spaces.undertype(space) + QS = Spaces.quadrature_style(space) + Nq = Quadratures.degrees_of_freedom(QS) + D = Quadratures.differentiation_matrix(FT, QS) + + local_geometry = get_local_geometry(space, ij, slabidx) + + DJv = D[i, 1] ⊠ Jv¹[1, vt] + for k in 2:Nq + DJv = DJv ⊞ D[i, k] ⊠ Jv¹[k, vt] + end + return RecursiveApply.rmul(DJv, local_geometry.invJ) +end Base.@propagate_inbounds function operator_evaluate( op::Divergence{(1, 2)}, (Jv¹, Jv²), @@ -204,6 +227,29 @@ Base.@propagate_inbounds function operator_evaluate( return RecursiveApply.rmul(DJv, local_geometry.invJ) end +Base.@propagate_inbounds function operator_evaluate( + op::WeakDivergence{(1,)}, + (WJv¹,), + space, + ij, + slabidx, +) + vt = CUDA.threadIdx().z + i, _ = ij.I + + FT = Spaces.undertype(space) + QS = Spaces.quadrature_style(space) + Nq = Quadratures.degrees_of_freedom(QS) + D = Quadratures.differentiation_matrix(FT, QS) + + local_geometry = get_local_geometry(space, ij, slabidx) + + Dᵀ₁WJv¹ = D[1, i] ⊠ WJv¹[1, vt] + for k in 2:Nq + Dᵀ₁WJv¹ = Dᵀ₁WJv¹ ⊞ D[k, i] ⊠ WJv¹[k, vt] + end + return ⊟(RecursiveApply.rdiv(Dᵀ₁WJv¹, local_geometry.WJ)) +end Base.@propagate_inbounds function operator_evaluate( op::WeakDivergence{(1, 2)}, (WJv¹, WJv²), @@ -230,6 +276,37 @@ Base.@propagate_inbounds function operator_evaluate( return ⊟(RecursiveApply.rdiv(Dᵀ₁WJv¹ ⊞ Dᵀ₂WJv², local_geometry.WJ)) end +Base.@propagate_inbounds function operator_evaluate( + op::Gradient{(1,)}, + input, + space, + ij, + slabidx, +) + vt = threadIdx().z + i, _ = ij.I + + FT = Spaces.undertype(space) + QS = Spaces.quadrature_style(space) + Nq = Quadratures.degrees_of_freedom(QS) + D = Quadratures.differentiation_matrix(FT, QS) + + @inbounds begin + ∂f∂ξ₁ = D[i, 1] ⊠ input[1, vt] + for k in 2:Nq + ∂f∂ξ₁ = ∂f∂ξ₁ ⊞ D[i, k] ⊠ input[k, vt] + end + end + if eltype(input) <: Number + return Geometry.Covariant1Vector(∂f∂ξ₁) + elseif eltype(input) <: Geometry.AxisVector + tensor_axes = (Geometry.Covariant1Axis(), axes(eltype(input))[1]) + tensor_components = hcat(Geometry.components(∂f∂ξ₁))' + return Geometry.AxisTensor(tensor_axes, tensor_components) + else + error("Unsupported input type for gradient operator: $(eltype(input))") + end +end Base.@propagate_inbounds function operator_evaluate( op::Gradient{(1, 2)}, input, @@ -245,61 +322,50 @@ Base.@propagate_inbounds function operator_evaluate( Nq = Quadratures.degrees_of_freedom(QS) D = Quadratures.differentiation_matrix(FT, QS) - if length(input) == 1 # check types - (v₁,) = input - @inbounds begin - ∂f∂ξ₁ = D[i, 1] ⊠ v₁[1, j, vt] - ∂f∂ξ₂ = D[j, 1] ⊠ v₁[i, 1, vt] - for k in 2:Nq - ∂f∂ξ₁ = ∂f∂ξ₁ ⊞ D[i, k] ⊠ v₁[k, j, vt] - ∂f∂ξ₂ = ∂f∂ξ₂ ⊞ D[j, k] ⊠ v₁[i, k, vt] - end + @inbounds begin + ∂f∂ξ₁ = D[i, 1] ⊠ input[1, j, vt] + ∂f∂ξ₂ = D[j, 1] ⊠ input[i, 1, vt] + for k in 2:Nq + ∂f∂ξ₁ = ∂f∂ξ₁ ⊞ D[i, k] ⊠ input[k, j, vt] + ∂f∂ξ₂ = ∂f∂ξ₂ ⊞ D[j, k] ⊠ input[i, k, vt] end + end + if eltype(input) <: Number return Geometry.Covariant12Vector(∂f∂ξ₁, ∂f∂ξ₂) - elseif length(input) == 2 - # Update `shmem` - v₁, v₂ = input - @inbounds begin - ∂f₁∂ξ₁ = D[i, 1] ⊠ v₁[1, j, vt] - ∂f₁∂ξ₂ = D[j, 1] ⊠ v₁[i, 1, vt] - ∂f₂∂ξ₁ = D[i, 1] ⊠ v₂[1, j, vt] - ∂f₂∂ξ₂ = D[j, 1] ⊠ v₂[i, 1, vt] - @simd for k in 2:Nq - ∂f₁∂ξ₁ = ∂f₁∂ξ₁ ⊞ D[i, k] ⊠ v₁[k, j, vt] - ∂f₁∂ξ₂ = ∂f₁∂ξ₂ ⊞ D[j, k] ⊠ v₁[i, k, vt] - ∂f₂∂ξ₁ = ∂f₂∂ξ₁ ⊞ D[i, k] ⊠ v₂[k, j, vt] - ∂f₂∂ξ₂ = ∂f₂∂ξ₂ ⊞ D[j, k] ⊠ v₂[i, k, vt] - end - end - return Geometry.AxisTensor( - (Geometry.Covariant12Axis(), Geometry.UVAxis()), - (∂f₁∂ξ₁, ∂f₁∂ξ₂, ∂f₂∂ξ₁, ∂f₂∂ξ₂), - ) + elseif eltype(input) <: Geometry.AxisVector + tensor_axes = (Geometry.Covariant12Axis(), axes(eltype(input))[1]) + tensor_components = + hcat(Geometry.components(∂f∂ξ₁), Geometry.components(∂f∂ξ₂))' + return Geometry.AxisTensor(tensor_axes, tensor_components) else - v₁, v₂, v₃ = input - @inbounds begin - ∂f₁∂ξ₁ = D[i, 1] ⊠ v₁[1, j, vt] - ∂f₁∂ξ₂ = D[j, 1] ⊠ v₁[i, 1, vt] - ∂f₂∂ξ₁ = D[i, 1] ⊠ v₂[1, j, vt] - ∂f₂∂ξ₂ = D[j, 1] ⊠ v₂[i, 1, vt] - ∂f₃∂ξ₁ = D[i, 1] ⊠ v₃[1, j, vt] - ∂f₃∂ξ₂ = D[j, 1] ⊠ v₃[i, 1, vt] - @simd for k in 2:Nq - ∂f₁∂ξ₁ = ∂f₁∂ξ₁ ⊞ D[i, k] ⊠ v₁[k, j, vt] - ∂f₁∂ξ₂ = ∂f₁∂ξ₂ ⊞ D[j, k] ⊠ v₁[i, k, vt] - ∂f₂∂ξ₁ = ∂f₂∂ξ₁ ⊞ D[i, k] ⊠ v₂[k, j, vt] - ∂f₂∂ξ₂ = ∂f₂∂ξ₂ ⊞ D[j, k] ⊠ v₂[i, k, vt] - ∂f₃∂ξ₁ = ∂f₃∂ξ₁ ⊞ D[i, k] ⊠ v₃[k, j, vt] - ∂f₃∂ξ₂ = ∂f₃∂ξ₂ ⊞ D[j, k] ⊠ v₃[i, k, vt] - end - end - return Geometry.AxisTensor( - (Geometry.Covariant12Axis(), Geometry.UVWAxis()), - (∂f₁∂ξ₁, ∂f₁∂ξ₂, ∂f₂∂ξ₁, ∂f₂∂ξ₂, ∂f₃∂ξ₁, ∂f₃∂ξ₂), - ) + error("Unsupported input type for gradient operator: $(eltype(input))") end end +Base.@propagate_inbounds function operator_evaluate( + op::WeakGradient{(1,)}, + (Wf,), + space, + ij, + slabidx, +) + vt = threadIdx().z + i, _ = ij.I + + FT = Spaces.undertype(space) + QS = Spaces.quadrature_style(space) + Nq = Quadratures.degrees_of_freedom(QS) + D = Quadratures.differentiation_matrix(FT, QS) + + local_geometry = get_local_geometry(space, ij, slabidx) + W = local_geometry.WJ * local_geometry.invJ + + Dᵀ₁Wf = D[1, i] ⊠ Wf[1, vt] + for k in 2:Nq + Dᵀ₁Wf = Dᵀ₁Wf ⊞ D[k, i] ⊠ Wf[k, vt] + end + return Geometry.Covariant1Vector(⊟(RecursiveApply.rdiv(Dᵀ₁Wf, W))) +end Base.@propagate_inbounds function operator_evaluate( op::WeakGradient{(1, 2)}, (Wf,), @@ -330,6 +396,54 @@ Base.@propagate_inbounds function operator_evaluate( ) end +Base.@propagate_inbounds function operator_evaluate( + op::Curl{(1,)}, + work, + space, + ij, + slabidx, +) + vt = threadIdx().z + i, _ = ij.I + + FT = Spaces.undertype(space) + QS = Spaces.quadrature_style(space) + Nq = Quadratures.degrees_of_freedom(QS) + D = Quadratures.differentiation_matrix(FT, QS) + local_geometry = get_local_geometry(space, ij, slabidx) + + if length(work) == 2 + _, v₂ = work + D₁v₂ = D[i, 1] ⊠ v₂[1, vt] + for k in 2:Nq + D₁v₂ = D₁v₂ ⊞ D[i, k] ⊠ v₂[k, vt] + end + return Geometry.Contravariant3Vector( + RecursiveApply.rmul(D₁v₂, local_geometry.invJ), + ) + elseif length(work) == 1 + (v₃,) = work + D₁v₃ = D[i, 1] ⊠ v₃[1, vt] + for k in 2:Nq + D₁v₃ = D₁v₃ ⊞ D[i, k] ⊠ v₃[k, vt] + end + return Geometry.Contravariant2Vector( + ⊟(RecursiveApply.rmul(D₁v₃, local_geometry.invJ)), + ) + else + _, v₂, v₃ = work + D₁v₂ = D[i, 1] ⊠ v₂[1, vt] + D₁v₃ = D[i, 1] ⊠ v₃[1, vt] + @simd for k in 2:Nq + D₁v₂ = D₁v₂ ⊞ D[i, k] ⊠ v₂[k, vt] + D₁v₃ = D₁v₃ ⊞ D[i, k] ⊠ v₃[k, vt] + end + return Geometry.Contravariant23Vector( + ⊟(RecursiveApply.rmul(D₁v₃, local_geometry.invJ)), + RecursiveApply.rmul(D₁v₂, local_geometry.invJ), + ) + end +end Base.@propagate_inbounds function operator_evaluate( op::Curl{(1, 2)}, work, @@ -389,6 +503,54 @@ Base.@propagate_inbounds function operator_evaluate( end end +Base.@propagate_inbounds function operator_evaluate( + op::WeakCurl{(1,)}, + work, + space, + ij, + slabidx, +) + vt = threadIdx().z + i, _ = ij.I + + FT = Spaces.undertype(space) + QS = Spaces.quadrature_style(space) + Nq = Quadratures.degrees_of_freedom(QS) + D = Quadratures.differentiation_matrix(FT, QS) + local_geometry = get_local_geometry(space, ij, slabidx) + + if length(work) == 2 + _, Wv₂ = work + Dᵀ₁Wv₂ = D[1, i] ⊠ Wv₂[1, vt] + for k in 2:Nq + Dᵀ₁Wv₂ = Dᵀ₁Wv₂ ⊞ D[k, i] ⊠ Wv₂[k, vt] + end + return Geometry.Contravariant3Vector( + RecursiveApply.rdiv(⊟(Dᵀ₁Wv₂), local_geometry.WJ), + ) + elseif length(work) == 1 + (Wv₃,) = work + Dᵀ₁Wv₃ = D[1, i] ⊠ Wv₃[1, vt] + for k in 2:Nq + Dᵀ₁Wv₃ = Dᵀ₁Wv₃ ⊞ D[k, i] ⊠ Wv₃[k, vt] + end + return Geometry.Contravariant2Vector( + RecursiveApply.rdiv(Dᵀ₁Wv₃, local_geometry.WJ), + ) + else + _, Wv₂, Wv₃ = work + Dᵀ₁Wv₂ = D[1, i] ⊠ Wv₂[1, vt] + Dᵀ₁Wv₃ = D[1, i] ⊠ Wv₃[1, vt] + @simd for k in 2:Nq + Dᵀ₁Wv₂ = Dᵀ₁Wv₂ ⊞ D[k, i] ⊠ Wv₂[k, vt] + Dᵀ₁Wv₃ = Dᵀ₁Wv₃ ⊞ D[k, i] ⊠ Wv₃[k, vt] + end + return Geometry.Contravariant23Vector( + RecursiveApply.rdiv(Dᵀ₁Wv₃, local_geometry.WJ), + RecursiveApply.rdiv(⊟(Dᵀ₁Wv₂), local_geometry.WJ), + ) + end +end Base.@propagate_inbounds function operator_evaluate( op::WeakCurl{(1, 2)}, work, diff --git a/ext/cuda/topologies_dss.jl b/ext/cuda/topologies_dss.jl index e3fa7df9d7..25dd5caa89 100644 --- a/ext/cuda/topologies_dss.jl +++ b/ext/cuda/topologies_dss.jl @@ -2,7 +2,7 @@ import ClimaCore: DataLayouts, Topologies, Spaces, Fields import ClimaCore.DataLayouts: CartesianFieldIndex using CUDA import ClimaCore.Topologies -import ClimaCore.Topologies: DSSDataTypes, DSSPerimeterTypes +import ClimaCore.Topologies: DSSTypes1D, DSSTypes2D, DSSPerimeterTypes import ClimaCore.Topologies: perimeter_vertex_node_index _max_threads_cuda() = 256 @@ -19,7 +19,7 @@ _configure_threadblock(nitems) = function Topologies.dss_load_perimeter_data!( dev::ClimaComms.CUDADevice, dss_buffer::Topologies.DSSBuffer, - data::DSSDataTypes, + data::DSSTypes2D, perimeter::Topologies.Perimeter2D, ) (; perimeter_data) = dss_buffer @@ -40,7 +40,7 @@ end function dss_load_perimeter_data_kernel!( perimeter_data::DSSPerimeterTypes, - data::DSSDataTypes, + data::DSSTypes2D, perimeter::Topologies.Perimeter2D{Nq}, ) where {Nq} gidx = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x @@ -60,7 +60,7 @@ end function Topologies.dss_unload_perimeter_data!( dev::ClimaComms.CUDADevice, - data::DSSDataTypes, + data::DSSTypes2D, dss_buffer::Topologies.DSSBuffer, perimeter, ) @@ -81,7 +81,7 @@ function Topologies.dss_unload_perimeter_data!( end function dss_unload_perimeter_data_kernel!( - data::DSSDataTypes, + data::DSSTypes2D, perimeter_data::DSSPerimeterTypes, perimeter::Topologies.Perimeter2D{Nq}, ) where {Nq} @@ -195,10 +195,10 @@ end function Topologies.dss_transform!( device::ClimaComms.CUDADevice, perimeter_data::DSSPerimeterTypes, - data::DSSDataTypes, + data::DSSTypes2D, perimeter::Topologies.Perimeter2D, - local_geometry::DSSDataTypes, - dss_weights::DSSDataTypes, + local_geometry::DSSTypes2D, + dss_weights::DSSTypes2D, localelems::AbstractVector{Int}, ) nlocalelems = length(localelems) @@ -240,10 +240,10 @@ end function dss_transform_kernel!( perimeter_data::DSSPerimeterTypes, - data::DSSDataTypes, + data::DSSTypes2D, perimeter::Topologies.Perimeter2D, - local_geometry::DSSDataTypes, - dss_weights::DSSDataTypes, + local_geometry::DSSTypes2D, + dss_weights::DSSTypes2D, localelems::AbstractVector{Int}, ::Val{nlocalelems}, ) where {nlocalelems} @@ -271,8 +271,8 @@ end function Topologies.dss_untransform!( device::ClimaComms.CUDADevice, perimeter_data::DSSPerimeterTypes, - data::DSSDataTypes, - local_geometry::DSSDataTypes, + data::DSSTypes2D, + local_geometry::DSSTypes2D, perimeter::Topologies.Perimeter2D, localelems::AbstractVector{Int}, ) @@ -312,8 +312,8 @@ end function dss_untransform_kernel!( perimeter_data::DSSPerimeterTypes, - data::DSSDataTypes, - local_geometry::DSSDataTypes, + data::DSSTypes2D, + local_geometry::DSSTypes2D, perimeter::Topologies.Perimeter2D, localelems::AbstractVector{Int}, ::Val{nlocalelems}, @@ -582,3 +582,55 @@ function dss_ghost_kernel!( end return nothing end + +function Topologies.dss_1d!( + ::ClimaComms.CUDADevice, + data::DSSTypes1D, + topology::Topologies.IntervalTopology, + local_geometry = nothing, + dss_weights = nothing, +) + (_, _, _, Nv, Nh) = DataLayouts.universal_size(data) + nfaces = Topologies.isperiodic(topology) ? Nh : Nh - 1 + nitems = Nv * nfaces + threads = _max_threads_cuda() + p = linear_partition(nitems, threads) + args = (data, local_geometry, dss_weights, nfaces) + auto_launch!( + dss_1d_kernel!, + args; + threads_s = p.threads, + blocks_s = p.blocks, + ) + return nothing +end + +function dss_1d_kernel!(data, local_geometry, dss_weights, nfaces) + T = eltype(data) + (Ni, _, _, Nv, Nh) = DataLayouts.universal_size(data) + gidx = threadIdx().x + (blockIdx().x - 1) * blockDim().x + if gidx ≤ Nv * nfaces + left_face_elem = cld(gidx, Nv) + level = gidx - (left_face_elem - 1) * Nv + right_face_elem = left_face_elem == Nh ? 1 : left_face_elem + 1 + left_idx = CartesianIndex(Ni, 1, 1, level, left_face_elem) + right_idx = CartesianIndex(1, 1, 1, level, right_face_elem) + val = + Topologies.dss_transform( + data, + local_geometry, + dss_weights, + left_idx, + ) ⊞ Topologies.dss_transform( + data, + local_geometry, + dss_weights, + right_idx, + ) + data[left_idx] = + Topologies.dss_untransform(T, val, local_geometry, left_idx) + data[right_idx] = + Topologies.dss_untransform(T, val, local_geometry, right_idx) + end + return nothing +end diff --git a/src/Grids/spectralelement.jl b/src/Grids/spectralelement.jl index 24e583b7fc..deae263d6c 100644 --- a/src/Grids/spectralelement.jl +++ b/src/Grids/spectralelement.jl @@ -63,6 +63,7 @@ function _SpectralElementGrid1D( ::Val{Nh}, ::Type{horizontal_layout_type}, ) where {Nh, horizontal_layout_type} + DA = ClimaComms.array_type(topology) global_geometry = Geometry.CartesianGlobalGeometry() CoordType = Topologies.coordinate_type(topology) AIdx = Geometry.coordinate_axis(CoordType) @@ -103,12 +104,13 @@ function _SpectralElementGrid1D( end end + device_local_geometry = DataLayouts.rebuild(local_geometry, DA) return SpectralElementGrid1D( topology, quadrature_style, global_geometry, - local_geometry, - compute_dss_weights(local_geometry, topology, quadrature_style), + device_local_geometry, + compute_dss_weights(device_local_geometry, topology, quadrature_style), ) end @@ -584,11 +586,7 @@ function compute_dss_weights(local_geometry, topology, quadrature_style) # Although the weights are defined as WJ / Σ collocated WJ, we can use J # instead of WJ if the weights are symmetric across element boundaries. dss_weights = copy(local_geometry.J) - if topology isa Topologies.IntervalTopology - is_dss_required && Topologies.dss_1d!(topology, dss_weights) - else - is_dss_required && Topologies.dss!(dss_weights, topology) - end + is_dss_required && Topologies.dss!(dss_weights, topology) @. dss_weights = local_geometry.J / dss_weights return dss_weights end diff --git a/src/MatrixFields/matrix_multiplication.jl b/src/MatrixFields/matrix_multiplication.jl index 8ad5cb3395..5df6eb9a83 100644 --- a/src/MatrixFields/matrix_multiplication.jl +++ b/src/MatrixFields/matrix_multiplication.jl @@ -201,6 +201,8 @@ This means that we can express the bounds on the interior values of ``i`` as struct MultiplyColumnwiseBandMatrixField <: Operators.FiniteDifferenceOperator end const ⋅ = MultiplyColumnwiseBandMatrixField() +Operators.strip_space(op::MultiplyColumnwiseBandMatrixField, _) = op + struct TopLeftMatrixCorner <: Operators.AbstractBoundaryCondition end struct BottomRightMatrixCorner <: Operators.AbstractBoundaryCondition end diff --git a/src/MatrixFields/operator_matrices.jl b/src/MatrixFields/operator_matrices.jl index 7eaa097968..e87f098b3d 100644 --- a/src/MatrixFields/operator_matrices.jl +++ b/src/MatrixFields/operator_matrices.jl @@ -90,6 +90,9 @@ function FDOperatorMatrix(op::O) where {O} return FDOperatorMatrix{O}(op) end +Operators.strip_space(op::FDOperatorMatrix, parent_space) = + FDOperatorMatrix(Operators.strip_space(op.op, parent_space)) + struct LazyOneArgFDOperatorMatrix{O <: OneArgFDOperator} <: AbstractLazyOperator op::O end diff --git a/src/Operators/finitedifference.jl b/src/Operators/finitedifference.jl index 2efae3f784..861004f7bb 100644 --- a/src/Operators/finitedifference.jl +++ b/src/Operators/finitedifference.jl @@ -80,6 +80,10 @@ Subtypes should define: """ abstract type AbstractBoundaryCondition end +strip_space(bc::AbstractBoundaryCondition, parent_space) = + hasproperty(bc, :val) ? + unionall_type(typeof(bc))(strip_space(bc.val, parent_space)) : bc + """ NullBoundaryCondition() @@ -204,6 +208,12 @@ has_boundary( ::RightBoundaryWindow{name}, ) where {name} = hasproperty(op.bcs, name) +strip_space(op::FiniteDifferenceOperator, parent_space) = + unionall_type(typeof(op))( + NamedTuple{keys(op.bcs)}( + strip_space_args(values(op.bcs), parent_space), + ), + ) abstract type AbstractStencilStyle <: Fields.AbstractFieldStyle end @@ -272,7 +282,15 @@ function Base.Broadcast.instantiate( return Base.Broadcast.Broadcasted{Style}(bc.f, args, axes) end - +function strip_space(sbc::StencilBroadcasted{Style}, parent_space) where {Style} + current_space = axes(sbc) + new_space = placeholder_space(current_space, parent_space) + return StencilBroadcasted{Style}( + strip_space(sbc.op, current_space), + strip_space_args(sbc.args, current_space), + new_space, + ) +end """ return_eltype(::Op, fields...) @@ -1356,6 +1374,11 @@ struct MonotoneLocalExtrema <: LimiterConstraint end LinVanLeerC2F(; constraint, kwargs...) = LinVanLeerC2F(NamedTuple(kwargs), constraint) +strip_space(op::LinVanLeerC2F, parent_space) = LinVanLeerC2F( + NamedTuple{keys(op.bcs)}(strip_space_args(values(op.bcs), parent_space)), + op.constraint, +) + return_eltype(::LinVanLeerC2F, V, A, dt) = Geometry.Contravariant3Vector{eltype(eltype(V))} @@ -3802,16 +3825,6 @@ function _threaded_copyto!(field_out::Field, bc, Ni::Int, Nj::Int, Nh::Int) return field_out end -function strip_space(bc::StencilBroadcasted{Style}, parent_space) where {Style} - current_space = axes(bc) - new_space = placeholder_space(current_space, parent_space) - return StencilBroadcasted{Style}( - bc.op, - strip_space_args(bc.args, current_space), - new_space, - ) -end - function Base.copyto!( field_out::Field, bc::Union{ diff --git a/src/Operators/operator2stencil.jl b/src/Operators/operator2stencil.jl index 111a7e4f25..9a4715e126 100644 --- a/src/Operators/operator2stencil.jl +++ b/src/Operators/operator2stencil.jl @@ -53,6 +53,9 @@ struct Operator2Stencil{O <: FiniteDifferenceOperator} <: op::O end +strip_space(op::Operator2Stencil, parent_space) = + Operator2Stencil(strip_space(op.op, parent_space)) + extrapolation_increases_bandwidth_error(op_type::Type) = throw( ArgumentError( "Operator2Stencil currently only supports operators whose stencils \ diff --git a/src/Spaces/dss.jl b/src/Spaces/dss.jl index 96caa127f5..2f0ab42c60 100644 --- a/src/Spaces/dss.jl +++ b/src/Spaces/dss.jl @@ -12,7 +12,7 @@ import ..Topologies: fill_send_buffer!, load_from_recv_buffer!, DSSTypesAll, - DSSDataTypes, + DSSTypes2D, DSSPerimeterTypes @@ -102,7 +102,7 @@ function weighted_dss_prepare!(data, space, dss_buffer::Nothing) end function weighted_dss_prepare!( - data::DSSDataTypes, + data::DSSTypes2D, space::Union{ Spaces.SpectralElementSpace2D, Spaces.ExtrudedFiniteDifferenceSpace, @@ -169,7 +169,7 @@ representative ghost vertices which store result of "ghost local" DSS are loaded 4). Start DSS communication with neighboring processes """ function weighted_dss_start!( - data::DSSDataTypes, + data::DSSTypes2D, space::Union{ Spaces.SpectralElementSpace2D, Spaces.ExtrudedFiniteDifferenceSpace, @@ -227,15 +227,16 @@ function weighted_dss_internal!( ) assert_same_eltype(data, dss_buffer) length(parent(data)) == 0 && return nothing + device = ClimaComms.device(topology(hspace)) if hspace isa SpectralElementSpace1D dss_1d!( - topology(hspace), + device, data, + topology(hspace), local_geometry_data(space), dss_weights(space), ) else - device = ClimaComms.device(topology(hspace)) dss_transform!( device, dss_buffer, @@ -303,7 +304,7 @@ weighted_dss_ghost!( function weighted_dss_ghost!( - data::DSSDataTypes, + data::DSSTypes2D, space::Union{AbstractSpectralElementSpace, ExtrudedFiniteDifferenceSpace}, hspace::SpectralElementSpace2D, dss_buffer::DSSBuffer, diff --git a/src/Spaces/extruded.jl b/src/Spaces/extruded.jl index ced77f28d7..cd9e6e88f5 100644 --- a/src/Spaces/extruded.jl +++ b/src/Spaces/extruded.jl @@ -220,15 +220,10 @@ column(space::ExtrudedFiniteDifferenceSpace, i, j, h) = column(space::ExtrudedFiniteDifferenceSpace, i, h) = column(space, Grids.ColumnIndex((i,), h)) -level(space::CenterExtrudedFiniteDifferenceSpace2D, v::Integer) = +level(space::ExtrudedFiniteDifferenceSpace, v) = + space isa ExtrudedFiniteDifferenceSpace3D ? + SpectralElementSpace2D(level(grid(space), v)) : SpectralElementSpace1D(level(grid(space), v)) -level(space::FaceExtrudedFiniteDifferenceSpace2D, v::PlusHalf) = - SpectralElementSpace1D(level(grid(space), v)) -level(space::CenterExtrudedFiniteDifferenceSpace3D, v::Integer) = - SpectralElementSpace2D(level(grid(space), v)) -level(space::FaceExtrudedFiniteDifferenceSpace3D, v::PlusHalf) = - SpectralElementSpace2D(level(grid(space), v)) - nlevels(space::ExtrudedFiniteDifferenceSpace) = size(local_geometry_data(space), 4) diff --git a/src/Topologies/dss.jl b/src/Topologies/dss.jl index 0b2c389d38..35fbd5e1d7 100644 --- a/src/Topologies/dss.jl +++ b/src/Topologies/dss.jl @@ -1,21 +1,14 @@ using .DataLayouts: CartesianFieldIndex -const DSSTypesAll = Union{ - DataLayouts.IFH, - DataLayouts.IHF, - DataLayouts.VIFH, - DataLayouts.VIHF, - DataLayouts.IJFH, - DataLayouts.IJHF, - DataLayouts.VIJFH, - DataLayouts.VIJHF, -} -const DSSDataTypes = Union{ +const DSSTypes1D = + Union{DataLayouts.IFH, DataLayouts.IHF, DataLayouts.VIFH, DataLayouts.VIHF} +const DSSTypes2D = Union{ DataLayouts.IJFH, DataLayouts.IJHF, DataLayouts.VIJFH, DataLayouts.VIJHF, } +const DSSTypesAll = Union{DSSTypes1D, DSSTypes2D} const DSSPerimeterTypes = Union{DataLayouts.VIFH, DataLayouts.VIHF} """ @@ -68,10 +61,10 @@ end Creates a [`DSSBuffer`](@ref) for the field data corresponding to `data` """ create_dss_buffer( - data::DSSDataTypes, + data::DSSTypes2D, topology::Topology2D, - local_geometry::Union{DSSDataTypes, Nothing} = nothing, - dss_weights::Union{DSSDataTypes, Nothing} = nothing, + local_geometry::Union{DSSTypes2D, Nothing} = nothing, + dss_weights::Union{DSSTypes2D, Nothing} = nothing, ) = create_dss_buffer( data, topology, @@ -81,11 +74,11 @@ create_dss_buffer( ) function create_dss_buffer( - data::DSSDataTypes, + data::DSSTypes2D, topology::Topology2D, ::Type{PerimeterLayout}, - local_geometry::Union{DSSDataTypes, Nothing} = nothing, - dss_weights::Union{DSSDataTypes, Nothing} = nothing, + local_geometry::Union{DSSTypes2D, Nothing} = nothing, + dss_weights::Union{DSSTypes2D, Nothing} = nothing, ) where {PerimeterLayout} S = eltype(data) Nij = DataLayouts.get_Nij(data) @@ -210,9 +203,9 @@ Part of [`ClimaCore.Spaces.weighted_dss!`](@ref). function dss_transform!( device::ClimaComms.AbstractDevice, dss_buffer::DSSBuffer, - data::DSSDataTypes, - local_geometry::DSSDataTypes, - dss_weights::DSSDataTypes, + data::DSSTypes2D, + local_geometry::DSSTypes2D, + dss_weights::DSSTypes2D, perimeter::Perimeter2D, localelems::AbstractVector{Int}, ) @@ -279,10 +272,10 @@ Part of [`ClimaCore.Spaces.weighted_dss!`](@ref). function dss_transform!( ::ClimaComms.AbstractCPUDevice, perimeter_data::DSSPerimeterTypes, - data::DSSDataTypes, + data::DSSTypes2D, perimeter::Perimeter2D{Nq}, - local_geometry::DSSDataTypes, - dss_weights::DSSDataTypes, + local_geometry::DSSTypes2D, + dss_weights::DSSTypes2D, localelems::Vector{Int}, ) where {Nq} (_, _, _, nlevels, _) = DataLayouts.universal_size(perimeter_data) @@ -331,8 +324,8 @@ Part of [`ClimaCore.Spaces.weighted_dss!`](@ref). function dss_untransform!( device::ClimaComms.AbstractDevice, dss_buffer::DSSBuffer, - data::DSSDataTypes, - local_geometry::DSSDataTypes, + data::DSSTypes2D, + local_geometry::DSSTypes2D, perimeter::Perimeter2D, localelems::AbstractVector{Int}, ) @@ -373,8 +366,8 @@ Part of [`ClimaCore.Spaces.weighted_dss!`](@ref). function dss_untransform!( ::ClimaComms.AbstractCPUDevice, perimeter_data::DSSPerimeterTypes, - data::DSSDataTypes, - local_geometry::DSSDataTypes, + data::DSSTypes2D, + local_geometry::DSSTypes2D, perimeter::Perimeter2D, localelems::Vector{Int}, ) @@ -397,7 +390,7 @@ end function dss_load_perimeter_data!( ::ClimaComms.AbstractCPUDevice, dss_buffer::DSSBuffer, - data::DSSDataTypes, + data::DSSTypes2D, perimeter::Perimeter2D, ) (; perimeter_data) = dss_buffer @@ -414,7 +407,7 @@ end function dss_unload_perimeter_data!( ::ClimaComms.AbstractCPUDevice, - data::DSSDataTypes, + data::DSSTypes2D, dss_buffer::DSSBuffer, perimeter::Perimeter2D, ) @@ -684,7 +677,14 @@ end Computed unweighted/pure DSS of `data`. """ -function dss!(data::DSSDataTypes, topology::Topology2D) +function dss!(data::DSSTypes1D, topology::IntervalTopology) + Nij = DataLayouts.get_Nij(data) + length(parent(data)) == 0 && return nothing + device = ClimaComms.device(topology) + dss_1d!(device, data, topology) + return nothing +end +function dss!(data::DSSTypes2D, topology::Topology2D) Nij = DataLayouts.get_Nij(data) length(parent(data)) == 0 && return nothing device = ClimaComms.device(topology) @@ -713,52 +713,23 @@ function dss!(data::DSSDataTypes, topology::Topology2D) end function dss_1d!( - htopology::AbstractTopology, - data, - local_geometry_data = nothing, + ::ClimaComms.AbstractCPUDevice, + data::DSSTypes1D, + topology::IntervalTopology, + local_geometry = nothing, dss_weights = nothing, ) - Nq = size(data, 1) - Nv = size(data, 4) - idx1 = CartesianIndex(1, 1, 1, 1, 1) - idx2 = CartesianIndex(Nq, 1, 1, 1, 1) - @inbounds for (elem1, face1, elem2, face2, reversed) in - interior_faces(htopology) - for level in 1:Nv - @assert face1 == 1 && face2 == 2 && !reversed - local_geometry_slab1 = slab(local_geometry_data, level, elem1) - weight_slab1 = slab(dss_weights, level, elem1) - data_slab1 = slab(data, level, elem1) - - local_geometry_slab2 = slab(local_geometry_data, level, elem2) - weight_slab2 = slab(dss_weights, level, elem2) - data_slab2 = slab(data, level, elem2) - val = - dss_transform( - data_slab1, - local_geometry_slab1, - weight_slab1, - idx1, - ) ⊞ dss_transform( - data_slab2, - local_geometry_slab2, - weight_slab2, - idx2, - ) - - data_slab1[idx1] = dss_untransform( - eltype(data_slab1), - val, - local_geometry_slab1, - idx1, - ) - data_slab2[idx2] = dss_untransform( - eltype(data_slab2), - val, - local_geometry_slab2, - idx2, - ) - end + T = eltype(data) + (Ni, _, _, Nv, Nh) = DataLayouts.universal_size(data) + nfaces = isperiodic(topology) ? Nh : Nh - 1 + @inbounds for left_face_elem in 1:nfaces, level in 1:Nv + right_face_elem = left_face_elem == Nh ? 1 : left_face_elem + 1 + left_idx = CartesianIndex(Ni, 1, 1, level, left_face_elem) + right_idx = CartesianIndex(1, 1, 1, level, right_face_elem) + val = + dss_transform(data, local_geometry, dss_weights, left_idx) ⊞ + dss_transform(data, local_geometry, dss_weights, right_idx) + data[left_idx] = dss_untransform(T, val, local_geometry, left_idx) + data[right_idx] = dss_untransform(T, val, local_geometry, right_idx) end - return data end diff --git a/test/DataLayouts/unit_mapreduce.jl b/test/DataLayouts/unit_mapreduce.jl index 511b92dc7b..7ee1271f00 100644 --- a/test/DataLayouts/unit_mapreduce.jl +++ b/test/DataLayouts/unit_mapreduce.jl @@ -163,6 +163,23 @@ end # data = DataLayouts.IH1JH2{S}(ArrayType{FT}, zeros; Nij); test_mapreduce_2!(context, data_view(data)) # TODO: test end +@testset "mapreduce extruded 1D data" begin + ArrayType = ClimaComms.array_type(device) + FT = Float64 + S = Tuple{FT, FT} + Nv = 4 + Ni = 3 + Nh = 5 + data = IFH{S}(ArrayType{FT}, zeros; Ni, Nh) + test_mapreduce_2!(context, data) + data = IHF{S}(ArrayType{FT}, zeros; Ni, Nh) + test_mapreduce_2!(context, data) + data = VIFH{S}(ArrayType{FT}, zeros; Nv, Ni, Nh) + test_mapreduce_2!(context, data) + data = VIHF{S}(ArrayType{FT}, zeros; Nv, Ni, Nh) + test_mapreduce_2!(context, data) +end + @testset "mapreduce with space with some non-round blocks" begin # https://github.com/CliMA/ClimaCore.jl/issues/2097 space = ClimaCore.CommonSpaces.RectangleXYSpace(; diff --git a/test/Operators/hybrid/unit_2d.jl b/test/Operators/hybrid/unit_2d.jl index c0432afc82..3ee66c0e80 100644 --- a/test/Operators/hybrid/unit_2d.jl +++ b/test/Operators/hybrid/unit_2d.jl @@ -33,8 +33,10 @@ end face_field = sin.(Fields.coordinate_field(hv_face_space).z) for npoints in (3, 8) - M_center = Operators.matrix_interpolate(center_field, npoints) - M_face = Operators.matrix_interpolate(face_field, npoints) + center_field_cpu = Adapt.adapt(Array, center_field) + face_field_cpu = Adapt.adapt(Array, face_field) + M_center = Operators.matrix_interpolate(center_field_cpu, npoints) + M_face = Operators.matrix_interpolate(face_field_cpu, npoints) @test size(M_center) == (10, 10 * npoints) @test size(M_face) == (10 + 1, 10 * npoints) end diff --git a/test/Operators/hybrid/utils_2d.jl b/test/Operators/hybrid/utils_2d.jl index 8ff92e7cdc..b8f508adb3 100644 --- a/test/Operators/hybrid/utils_2d.jl +++ b/test/Operators/hybrid/utils_2d.jl @@ -15,6 +15,7 @@ using Test using StaticArrays, IntervalSets, LinearAlgebra +import Adapt import ClimaComms ClimaComms.@import_required_backends diff --git a/test/Operators/unit_operators_examples.jl b/test/Operators/unit_operators_examples.jl index 534b94855a..3770fb84fc 100644 --- a/test/Operators/unit_operators_examples.jl +++ b/test/Operators/unit_operators_examples.jl @@ -57,21 +57,12 @@ grid = Grids.ExtrudedFiniteDifferenceGrid( ) center_space = Spaces.CenterExtrudedFiniteDifferenceSpace(grid) face_space = Spaces.FaceExtrudedFiniteDifferenceSpace(grid) - -# Create fields and show that it fails ᶜgradᵥ = Operators.GradientF2C() level_field = Fields.level(Fields.Field(Float64, face_space), Utilities.half) ᶠscalar_field = Fields.Field(Float64, face_space) -# Does not work: -using Test -@testset "Broken broadcast expression on GPUs" begin - if device isa ClimaComms.CUDADevice - @test_broken begin - @. ᶜgradᵥ(level_field + ᶠscalar_field) - end - else - @. ᶜgradᵥ(level_field + ᶠscalar_field) - end -end +using Test +@testset "Example broadcast expression on GPUs" begin + @test !isnothing(@. ᶜgradᵥ(level_field + ᶠscalar_field)) +end # Formerly broken; fixed in CliMA/ClimaCore.jl#2172 diff --git a/test/Spaces/opt_spaces.jl b/test/Spaces/opt_spaces.jl index 192b956bae..7003c9deb5 100644 --- a/test/Spaces/opt_spaces.jl +++ b/test/Spaces/opt_spaces.jl @@ -35,7 +35,7 @@ end #! format: off if ClimaComms.device(context) isa ClimaComms.CUDADevice test_n_failures(91, TU.PointSpace, context) - test_n_failures(141, TU.SpectralElementSpace1D, context) + test_n_failures(825, TU.SpectralElementSpace1D, context) test_n_failures(1141, TU.SpectralElementSpace2D, context) test_n_failures(3, TU.ColumnCenterFiniteDifferenceSpace, context) test_n_failures(4, TU.ColumnFaceFiniteDifferenceSpace, context) @@ -44,7 +44,7 @@ end test_n_failures(1146, TU.FaceExtrudedFiniteDifferenceSpace, context) else test_n_failures(0, TU.PointSpace, context) - test_n_failures(137, TU.SpectralElementSpace1D, context) + test_n_failures(150, TU.SpectralElementSpace1D, context) test_n_failures(310, TU.SpectralElementSpace2D, context) test_n_failures(4, TU.ColumnCenterFiniteDifferenceSpace, context) test_n_failures(5, TU.ColumnFaceFiniteDifferenceSpace, context) diff --git a/test/Spaces/unit_spaces.jl b/test/Spaces/unit_spaces.jl index 4a691db084..b1f2e91d43 100644 --- a/test/Spaces/unit_spaces.jl +++ b/test/Spaces/unit_spaces.jl @@ -53,13 +53,14 @@ on_gpu = ClimaComms.device() isa ClimaComms.CUDADevice @test eltype(coord_data) == Geometry.XPoint{Float64} @test DataLayouts.farray_size(Spaces.coordinates_data(space)) == (4, 1, 1) - coord_slab = slab(Spaces.coordinates_data(space), 1) + coord_slab = Adapt.adapt(Array, slab(Spaces.coordinates_data(space), 1)) @test coord_slab[slab_index(1)] == Geometry.XPoint{FT}(-3) @test typeof(coord_slab[slab_index(4)]) == Geometry.XPoint{FT} @test coord_slab[slab_index(4)].x ≈ FT(5) - local_geometry_slab = slab(Spaces.local_geometry_data(space), 1) - dss_weights_slab = slab(space.grid.dss_weights, 1) + local_geometry_slab = + Adapt.adapt(Array, slab(Spaces.local_geometry_data(space), 1)) + dss_weights_slab = Adapt.adapt(Array, slab(space.grid.dss_weights, 1)) for i in 1:4 @test Geometry.components(local_geometry_slab[slab_index(i)].∂x∂ξ) ≈ @@ -79,8 +80,8 @@ on_gpu = ClimaComms.device() isa ClimaComms.CUDADevice point_space = Spaces.column(space, 1, 1) @test point_space isa Spaces.PointSpace - @test Spaces.coordinates_data(point_space)[] == - Spaces.column(coord_data, 1, 1)[] + @test parent(Spaces.coordinates_data(point_space)) == + parent(Spaces.column(coord_data, 1, 1)) @test Spaces.local_geometry_type(typeof(point_space)) <: Geometry.LocalGeometry end