Add GPU support for extruded 1D spaces #2172

Open · wants to merge 3 commits into main
.buildkite/pipeline.yml (12 changes: 10 additions & 2 deletions)

@@ -358,8 +358,6 @@ steps:
- label: "Unit: distributed remapping with CUDA (1 process)"
key: distributed_remapping_gpu_1proc
command: "julia --color=yes --check-bounds=yes --project=.buildkite test/Remapping/distributed_remapping.jl"
env:
CLIMACOMMS_DEVICE: "CUDA"
env:
CLIMACOMMS_DEVICE: "CUDA"
agents:
@@ -609,6 +607,16 @@ steps:
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Operators/hybrid/unit_2d.jl"
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Operators/hybrid/convergence_2d.jl"

- label: "Unit: hyb ops 2d CUDA"
key: unit_hyb_ops_2d_cuda
command:
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Operators/hybrid/unit_2d.jl"
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Operators/hybrid/convergence_2d.jl"
env:
CLIMACOMMS_DEVICE: "CUDA"
agents:
slurm_gpus: 1

- label: "Unit: hyb ops 3d"
key: unit_hyb_ops_3d
command:
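The new `unit_hyb_ops_2d_cuda` job reuses the existing 2D hybrid-operator test scripts and only switches the compute device through the `CLIMACOMMS_DEVICE` variable. A minimal local reproduction, assuming `ClimaComms.device()` reads that variable (as the pipeline's `env` blocks suggest), might look like:

```julia
# Hypothetical local reproduction of the "Unit: hyb ops 2d CUDA" job.
# Assumes ClimaComms picks the device from CLIMACOMMS_DEVICE and that the
# .buildkite project environment provides CUDA.
ENV["CLIMACOMMS_DEVICE"] = "CUDA"

using ClimaComms
@assert ClimaComms.device() isa ClimaComms.CUDADevice

# Same scripts the pipeline runs, included from a local checkout.
include(joinpath("test", "Operators", "hybrid", "unit_2d.jl"))
include(joinpath("test", "Operators", "hybrid", "convergence_2d.jl"))
```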
ext/cuda/data_layouts_mapreduce.jl (12 changes: 3 additions & 9 deletions)

@@ -24,13 +24,7 @@ end
 function mapreduce_cuda(
     f,
     op,
-    data::Union{
-        DataLayouts.VF,
-        DataLayouts.IJFH,
-        DataLayouts.IJHF,
-        DataLayouts.VIJFH,
-        DataLayouts.VIJHF,
-    };
+    data::DataLayouts.AbstractData;
     weighted_jacobian = OnesArray(parent(data)),
     opargs...,
 )
@@ -132,9 +126,9 @@ function mapreduce_cuda_kernel!(
     gidx = _get_gidx(tidx, bidx, effective_blksize)
     reduction = CUDA.CuStaticSharedArray(T, shmemsize)
     reduction[tidx] = 0
-    (Nij, _, _, Nv, Nh) = DataLayouts.universal_size(us)
+    (Ni, Nj, _, Nv, Nh) = DataLayouts.universal_size(us)
     Nf = 1 # a view into `fidx` always gives a size of Nf = 1
-    nitems = Nv * Nij * Nij * Nf * Nh
+    nitems = Nv * Ni * Nj * Nf * Nh

     # load shmem
     if gidx ≤ nitems
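The two changes above go together: `mapreduce_cuda` now accepts any `AbstractData` layout (so the layouts used by extruded 1D spaces can reach the CUDA path), and the kernel counts items with `Ni` and `Nj` separately instead of assuming a square `Nij × Nij` element. A small sketch of why that matters, with assumed sizes for a 1D base space where `Nj == 1`:

```julia
# Assumed example sizes; universal_size(us) is taken to return (Ni, Nj, _, Nv, Nh)
# as destructured in the kernel above. For a 1D horizontal base space, Nj == 1.
(Ni, Nj, Nv, Nh) = (4, 1, 10, 30)
Nf = 1  # a view into `fidx` always gives a size of Nf = 1, as in the kernel

nitems_old = Nv * Ni * Ni * Nf * Nh  # 4800: assumes an Ni × Ni element footprint
nitems_new = Nv * Ni * Nj * Nf * Nh  # 1200: matches the number of stored values

# With the old count, the kernel's `gidx ≤ nitems` guard would admit threads
# beyond the actual data length; the new count keeps it exact.
```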
ext/cuda/data_layouts_threadblock.jl (24 changes: 12 additions & 12 deletions)

@@ -213,15 +213,15 @@ end
     us::DataLayouts.UniversalSize,
     n_max_threads::Integer,
 )
-    (Nij, _, _, _, Nh) = DataLayouts.universal_size(us)
+    (Ni, Nj, _, _, Nh) = DataLayouts.universal_size(us)
     Nh_thread = min(
-        Int(fld(n_max_threads, Nij * Nij)),
+        Int(fld(n_max_threads, Ni * Nj)),
         maximum_allowable_threads()[3],
         Nh,
     )
     Nh_blocks = cld(Nh, Nh_thread)
-    @assert prod((Nij, Nij, Nh_thread)) ≤ n_max_threads "threads,n_max_threads=($(prod((Nij, Nij, Nh_thread))),$n_max_threads)"
-    return (; threads = (Nij, Nij, Nh_thread), blocks = (Nh_blocks,))
+    @assert prod((Ni, Nj, Nh_thread)) ≤ n_max_threads "threads,n_max_threads=($(prod((Ni, Nj, Nh_thread))),$n_max_threads)"
+    return (; threads = (Ni, Nj, Nh_thread), blocks = (Nh_blocks,))
 end
 @inline function columnwise_universal_index(us::UniversalSize)
     (i, j, th) = CUDA.threadIdx()
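For the columnwise partition, keeping `Ni` and `Nj` separate lets a 1D base space pack far more horizontal elements into each block than the square `Nij * Nij` form would. A sketch of the resulting launch shape under assumed sizes (z-dimension thread limit taken to be 64):

```julia
# Assumed example for an extruded 1D space: Ni = 4 nodes per element, Nj = 1,
# Nh = 30 horizontal elements, n_max_threads = 256.
Ni, Nj, Nh, n_max_threads = 4, 1, 30, 256
z_limit = 64  # assumed value of maximum_allowable_threads()[3]

Nh_thread = min(fld(n_max_threads, Ni * Nj), z_limit, Nh)  # min(64, 64, 30) = 30
Nh_blocks = cld(Nh, Nh_thread)                             # 1

# launch: threads = (4, 1, 30), i.e. 120 threads covering all 30 elements in one block;
# the Nij * Nij form would have budgeted fld(256, 16) = 16 elements per block instead.
```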
@@ -241,9 +241,9 @@ end
     n_max_threads::Integer;
     Nnames,
 )
-    (Nij, _, _, _, Nh) = DataLayouts.universal_size(us)
-    @assert prod((Nij, Nij, Nnames)) ≤ n_max_threads "threads,n_max_threads=($(prod((Nij, Nij, Nnames))),$n_max_threads)"
-    return (; threads = (Nij, Nij, Nnames), blocks = (Nh,))
+    (Ni, Nj, _, _, Nh) = DataLayouts.universal_size(us)
+    @assert prod((Ni, Nj, Nnames)) ≤ n_max_threads "threads,n_max_threads=($(prod((Ni, Nj, Nnames))),$n_max_threads)"
+    return (; threads = (Ni, Nj, Nnames), blocks = (Nh,))
 end
 @inline function multiple_field_solve_universal_index(us::UniversalSize)
     (i, j, iname) = CUDA.threadIdx()
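The multiple-field-solve partition follows the same pattern: one thread per `(i, j, field)` triple, one block per horizontal element. Under assumed sizes for an extruded 1D space:

```julia
# Assumed example: Ni = 4, Nj = 1, Nh = 30 horizontal elements, Nnames = 3 fields.
Ni, Nj, Nh, Nnames = 4, 1, 30, 3
@assert Ni * Nj * Nnames ≤ 256  # mirrors the n_max_threads check above

threads = (Ni, Nj, Nnames)  # (4, 1, 3): 12 threads per block
blocks = (Nh,)              # one block per horizontal element
```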
@@ -258,12 +258,12 @@ end
     us::DataLayouts.UniversalSize,
     n_max_threads::Integer = 256;
 )
-    (Nq, _, _, Nv, Nh) = DataLayouts.universal_size(us)
-    Nvthreads = min(fld(n_max_threads, Nq * Nq), maximum_allowable_threads()[3])
+    (Ni, Nj, _, Nv, Nh) = DataLayouts.universal_size(us)
+    Nvthreads = min(fld(n_max_threads, Ni * Nj), maximum_allowable_threads()[3])
     Nvblocks = cld(Nv, Nvthreads)
-    @assert prod((Nq, Nq, Nvthreads)) ≤ n_max_threads "threads,n_max_threads=($(prod((Nq, Nq, Nvthreads))),$n_max_threads)"
-    @assert Nq * Nq ≤ n_max_threads
-    return (; threads = (Nq, Nq, Nvthreads), blocks = (Nh, Nvblocks), Nvthreads)
+    @assert prod((Ni, Nj, Nvthreads)) ≤ n_max_threads "threads,n_max_threads=($(prod((Ni, Nj, Nvthreads))),$n_max_threads)"
+    @assert Ni * Nj ≤ n_max_threads
+    return (; threads = (Ni, Nj, Nvthreads), blocks = (Nh, Nvblocks), Nvthreads)
 end
 @inline function spectral_universal_index(space::Spaces.AbstractSpace)
     i = threadIdx().x
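The spectral partition splits each block between one horizontal element's nodal points and as many vertical levels as fit under `n_max_threads`. A sketch with assumed sizes, contrasting a 2D base (`Ni = Nj = 4`) with a 1D base (`Nj = 1`), and a z-dimension thread limit assumed to be 64:

```julia
# Mirrors the arithmetic in the partition above, with the device limit fixed at 64.
function spectral_launch(Ni, Nj, Nv, Nh; n_max_threads = 256, z_limit = 64)
    Nvthreads = min(fld(n_max_threads, Ni * Nj), z_limit)
    Nvblocks = cld(Nv, Nvthreads)
    return (; threads = (Ni, Nj, Nvthreads), blocks = (Nh, Nvblocks))
end

spectral_launch(4, 4, 63, 100)  # threads = (4, 4, 16), blocks = (100, 4)
spectral_launch(4, 1, 63, 100)  # threads = (4, 1, 64), blocks = (100, 1)
```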