Update benchmarks #38

Merged 1 commit on Oct 4, 2024

13 changes: 5 additions & 8 deletions flame/flame.jl
@@ -16,16 +16,13 @@ X = get_arrays(:x, arr_size, AType)
 Y = get_arrays(:y, arr_size, AType)
 
 function perf_kernel_fused!(X, Y)
-    (; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X
-    (; y1, y2, y3, y4, y5, y6, y7, y8, y9, y10) = Y
+    (; x1, x2, x3, x4) = X
+    (; y1, y2, y3, y4) = Y
     @fused_direct begin
         @. y1 = x1 + x2 + x3 + x4
-        @. y2 = x2 + x3 + x4 + x5
-        @. y3 = x3 + x4 + x5 + x6
-        @. y4 = x4 + x5 + x6 + x7
-        @. y5 = x5 + x6 + x7 + x8
-        @. y6 = x6 + x7 + x8 + x9
-        @. y7 = x7 + x8 + x9 + x10
+        @. y2 = x1 * x2 * x3 * x4
+        @. y3 = x1 + x2 - x3 + x4
+        @. y4 = x1 * x2 + x3 * x4
     end
 end
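
For context, here is the updated flame.jl kernel in self-contained form: a minimal sketch that fuses the four broadcasts into a single pass with MBF.@fused_direct. The plain Vector inputs and the array length are illustrative; only MultiBroadcastFusion itself is assumed.

import MultiBroadcastFusion as MBF

n = 10_000
X = (; x1 = rand(Float32, n), x2 = rand(Float32, n), x3 = rand(Float32, n), x4 = rand(Float32, n))
Y = (; y1 = zeros(Float32, n), y2 = zeros(Float32, n), y3 = zeros(Float32, n), y4 = zeros(Float32, n))

function perf_kernel_fused!(X, Y)
    (; x1, x2, x3, x4) = X
    (; y1, y2, y3, y4) = Y
    MBF.@fused_direct begin   # one fused pass instead of four separate broadcasts
        @. y1 = x1 + x2 + x3 + x4
        @. y2 = x1 * x2 * x3 * x4
        @. y3 = x1 + x2 - x3 + x4
        @. y4 = x1 * x2 + x3 * x4
    end
end

perf_kernel_fused!(X, Y)
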
78 changes: 34 additions & 44 deletions test/execution/bm_fused_reads_vs_hard_coded.jl
@@ -11,16 +11,13 @@ import MultiBroadcastFusion as MBF
 perf_kernel_hard_coded!(X, Y) = perf_kernel_hard_coded!(X, Y, MBF.device(X.x1))
 
 function perf_kernel_hard_coded!(X, Y, ::MBF.CPU)
-    (; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X
-    (; y1, y2, y3, y4, y5, y6, y7, y8, y9, y10) = Y
+    (; x1, x2, x3, x4) = X
+    (; y1, y2, y3, y4) = Y
     @inbounds for i in eachindex(x1)
         y1[i] = x1[i] + x2[i] + x3[i] + x4[i]
-        y2[i] = x2[i] + x3[i] + x4[i] + x5[i]
-        y3[i] = x3[i] + x4[i] + x5[i] + x6[i]
-        y4[i] = x4[i] + x5[i] + x6[i] + x7[i]
-        y5[i] = x5[i] + x6[i] + x7[i] + x8[i]
-        y6[i] = x6[i] + x7[i] + x8[i] + x9[i]
-        y7[i] = x7[i] + x8[i] + x9[i] + x10[i]
+        y2[i] = x1[i] * x2[i] * x3[i] * x4[i]
+        y3[i] = x1[i] + x2[i] - x3[i] + x4[i]
+        y4[i] = x1[i] * x2[i] + x3[i] * x4[i]
     end
 end

@@ -40,18 +37,17 @@ use_cuda = @isdefined(CUDA) && CUDA.has_cuda() # will be true if you first run `using CUDA`
     )
 end
 function knl_multi_copyto_hard_coded!(X, Y, ::Val{nitems}) where {nitems}
-    (; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X
-    (; y1, y2, y3, y4, y5, y6, y7, y8, y9, y10) = Y
-    idx = CUDA.threadIdx().x + (CUDA.blockIdx().x - 1) * CUDA.blockDim().x
+    (; x1, x2, x3, x4) = X
+    (; y1, y2, y3, y4) = Y
+    i =
+        CUDA.threadIdx().x +
+        (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x
     @inbounds begin
-        if idx ≤ nitems
-            y1[idx] = x1[idx] + x2[idx] + x3[idx] + x4[idx]
-            y2[idx] = x2[idx] + x3[idx] + x4[idx] + x5[idx]
-            y3[idx] = x3[idx] + x4[idx] + x5[idx] + x6[idx]
-            y4[idx] = x4[idx] + x5[idx] + x6[idx] + x7[idx]
-            y5[idx] = x5[idx] + x6[idx] + x7[idx] + x8[idx]
-            y6[idx] = x6[idx] + x7[idx] + x8[idx] + x9[idx]
-            y7[idx] = x7[idx] + x8[idx] + x9[idx] + x10[idx]
+        if i ≤ nitems
+            y1[i] = x1[i] + x2[i] + x3[i] + x4[i]
+            y2[i] = x1[i] * x2[i] * x3[i] * x4[i]
+            y3[i] = x1[i] + x2[i] - x3[i] + x4[i]
+            y4[i] = x1[i] * x2[i] + x3[i] * x4[i]
         end
     end
     return nothing
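
One note on the rewritten index computation in this CUDA kernel: the switch from subtracting a literal 1 to subtracting Int32(1) presumably keeps the thread-index arithmetic in 32-bit integers (CUDA.jl's threadIdx/blockIdx/blockDim return Int32), since mixing in a plain integer literal promotes the expression to Int64. A plain-Julia illustration of that promotion:

typeof(Int32(5) - 1)          # Int64; Int32 mixed with an Int64 literal promotes
typeof(Int32(5) - Int32(1))   # Int32; the arithmetic stays in 32-bit registers
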
@@ -61,31 +57,25 @@ end
 # ===========================================
 
 function perf_kernel_unfused!(X, Y)
-    (; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X
-    (; y1, y2, y3, y4, y5, y6, y7, y8, y9, y10) = Y
-    # 7 writes; 10 unique reads
-    # 7 writes; 28 reads including redundant ones
+    (; x1, x2, x3, x4) = X
+    (; y1, y2, y3, y4) = Y
+    # 4 writes; 4 unique reads
+    # 4 writes; 16 reads including redundant ones
     @. y1 = x1 + x2 + x3 + x4
-    @. y2 = x2 + x3 + x4 + x5
-    @. y3 = x3 + x4 + x5 + x6
-    @. y4 = x4 + x5 + x6 + x7
-    @. y5 = x5 + x6 + x7 + x8
-    @. y6 = x6 + x7 + x8 + x9
-    @. y7 = x7 + x8 + x9 + x10
+    @. y2 = x1 * x2 * x3 * x4
+    @. y3 = x1 + x2 - x3 + x4
+    @. y4 = x1 * x2 + x3 * x4
     return nothing
 end
 
 function perf_kernel_fused!(X, Y)
-    (; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X
-    (; y1, y2, y3, y4, y5, y6, y7, y8, y9, y10) = Y
+    (; x1, x2, x3, x4) = X
+    (; y1, y2, y3, y4) = Y
     MBF.@fused_direct begin
         @. y1 = x1 + x2 + x3 + x4
-        @. y2 = x2 + x3 + x4 + x5
-        @. y3 = x3 + x4 + x5 + x6
-        @. y4 = x4 + x5 + x6 + x7
-        @. y5 = x5 + x6 + x7 + x8
-        @. y6 = x6 + x7 + x8 + x9
-        @. y7 = x7 + x8 + x9 + x10
+        @. y2 = x1 * x2 * x3 * x4
+        @. y3 = x1 + x2 - x3 + x4
+        @. y4 = x1 * x2 + x3 * x4
     end
 end

@@ -95,7 +85,7 @@ AType = use_cuda ? CUDA.CuArray : Array
 device_name = use_cuda ? CUDA.name(CUDA.device()) : "CPU"
 bm = Benchmark(; device_name, float_type = Float32)
 
-problem_size = (50, 5, 5, 6, 50)
+problem_size = (50, 5, 5, 6, 5400)
 
 array_size = problem_size # array
 X = get_arrays(:x, AType, bm.float_type, array_size)
@@ -118,7 +108,7 @@ push_benchmark!(
     perf_kernel_unfused!,
     X,
     Y;
-    n_reads_writes = 7 + 10,
+    n_reads_writes = 4 + 4,
     problem_size = array_size,
 )
 push_benchmark!(
@@ -127,7 +117,7 @@ push_benchmark!(
     perf_kernel_fused!,
     X,
     Y;
-    n_reads_writes = 7 + 10,
+    n_reads_writes = 4 + 4,
     problem_size = array_size,
 )
 use_cuda && push_benchmark!(
@@ -136,7 +126,7 @@ use_cuda && push_benchmark!(
     perf_kernel_hard_coded!,
     X,
     Y;
-    n_reads_writes = 7 + 10,
+    n_reads_writes = 4 + 4,
     problem_size = array_size,
 )

@@ -161,7 +151,7 @@ push_benchmark!(
     perf_kernel_unfused!,
     X,
     Y;
-    n_reads_writes = 7 + 10,
+    n_reads_writes = 4 + 4,
     problem_size = array_size,
 )
 push_benchmark!(
@@ -170,7 +160,7 @@ push_benchmark!(
     perf_kernel_fused!,
     X,
     Y;
-    n_reads_writes = 7 + 10,
+    n_reads_writes = 4 + 4,
     problem_size = array_size,
 )
 use_cuda && push_benchmark!(
@@ -179,7 +169,7 @@ use_cuda && push_benchmark!(
     perf_kernel_hard_coded!,
     X,
     Y;
-    n_reads_writes = 7 + 10,
+    n_reads_writes = 4 + 4,
     problem_size = array_size,
 )

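
The n_reads_writes = 4 + 4 passed to push_benchmark! counts one read per input array plus one write per output array. As a rough sketch of how such a count typically turns into an effective-bandwidth figure (the actual formula lives in utils_benchmark.jl, which is not part of this diff, and the kernel time below is made up):

# Hypothetical effective-bandwidth estimate from a measured kernel time.
n_reads_writes = 4 + 4                 # 4 array reads + 4 array writes per element
problem_size = (50, 5, 5, 6, 5400)
float_type = Float32
t_kernel = 2.0e-3                      # seconds; a made-up measurement
bytes_moved = n_reads_writes * prod(problem_size) * sizeof(float_type)
bandwidth_GBs = bytes_moved / t_kernel / 1e9
println("≈ $(round(bandwidth_GBs; digits = 1)) GB/s effective bandwidth")
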
36 changes: 15 additions & 21 deletions test/execution/bm_fused_shared_reads.jl
@@ -9,28 +9,22 @@ include("utils_benchmark.jl")
 import MultiBroadcastFusion as MBF
 
 function perf_kernel_shared_reads_unfused!(X, Y)
-    (; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X
-    (; y1, y2, y3, y4, y5, y6, y7, y8, y9, y10) = Y
+    (; x1, x2, x3, x4) = X
+    (; y1, y2, y3, y4) = Y
     @. y1 = x1 + x2 + x3 + x4
-    @. y2 = x2 + x3 + x4 + x5
-    @. y3 = x3 + x4 + x5 + x6
-    @. y4 = x4 + x5 + x6 + x7
-    @. y5 = x5 + x6 + x7 + x8
-    @. y6 = x6 + x7 + x8 + x9
-    @. y7 = x7 + x8 + x9 + x10
+    @. y2 = x1 * x2 * x3 * x4
+    @. y3 = x1 + x2 - x3 + x4
+    @. y4 = x1 * x2 + x3 * x4
 end
 
 function perf_kernel_shared_reads_fused!(X, Y)
-    (; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X
-    (; y1, y2, y3, y4, y5, y6, y7, y8, y9, y10) = Y
+    (; x1, x2, x3, x4) = X
+    (; y1, y2, y3, y4) = Y
     MBF.@fused_direct begin
         @. y1 = x1 + x2 + x3 + x4
-        @. y2 = x2 + x3 + x4 + x5
-        @. y3 = x3 + x4 + x5 + x6
-        @. y4 = x4 + x5 + x6 + x7
-        @. y5 = x5 + x6 + x7 + x8
-        @. y6 = x6 + x7 + x8 + x9
-        @. y7 = x7 + x8 + x9 + x10
+        @. y2 = x1 * x2 * x3 * x4
+        @. y3 = x1 + x2 - x3 + x4
+        @. y4 = x1 * x2 + x3 * x4
     end
 end

@@ -39,7 +33,7 @@ use_cuda = @isdefined(CUDA) && CUDA.has_cuda() # will be true if you first run `using CUDA`
 AType = use_cuda ? CUDA.CuArray : Array
 device_name = use_cuda ? CUDA.name(CUDA.device()) : "CPU"
 bm = Benchmark(; device_name, float_type = Float32)
-problem_size = (50, 5, 5, 6, 50)
+problem_size = (50, 5, 5, 6, 5400)
 
 array_size = problem_size # array
 X = get_arrays(:x, AType, bm.float_type, array_size)
@@ -56,7 +50,7 @@ push_benchmark!(
     perf_kernel_shared_reads_unfused!,
     X,
     Y;
-    n_reads_writes = 7 + 10,
+    n_reads_writes = 4 + 4,
     problem_size = array_size,
 )
 push_benchmark!(
@@ -65,7 +59,7 @@ push_benchmark!(
     perf_kernel_shared_reads_fused!,
     X,
     Y;
-    n_reads_writes = 7 + 10,
+    n_reads_writes = 4 + 4,
     problem_size = array_size,
 )

@@ -84,7 +78,7 @@ push_benchmark!(
     perf_kernel_shared_reads_unfused!,
     X,
     Y;
-    n_reads_writes = 7 + 10,
+    n_reads_writes = 4 + 4,
     problem_size = array_size,
 )
 push_benchmark!(
@@ -93,7 +87,7 @@ push_benchmark!(
     perf_kernel_shared_reads_fused!,
     X,
     Y;
-    n_reads_writes = 7 + 10,
+    n_reads_writes = 4 + 4,
     problem_size = array_size,
 )

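
A note on the problem_size bump from (50, 5, 5, 6, 50) to (50, 5, 5, 6, 5400), which this PR applies to all of the benchmarks: with four input and four output Float32 arrays the working set grows to roughly 1.2 GiB, presumably so the kernels stream from main memory instead of running cache-resident. The arithmetic behind that figure:

# Working-set size for the updated problem_size (4 inputs + 4 outputs, Float32).
problem_size = (50, 5, 5, 6, 5400)
elements_per_array = prod(problem_size)            # 40_500_000
n_arrays = 4 + 4
bytes_total = n_arrays * elements_per_array * sizeof(Float32)
println("working set ≈ $(round(bytes_total / 2^30; digits = 2)) GiB")   # ≈ 1.21 GiB
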
57 changes: 22 additions & 35 deletions test/execution/bm_fused_shared_reads_writes.jl
@@ -9,48 +9,35 @@ include("utils_benchmark.jl")
 import MultiBroadcastFusion as MBF
 
 function perf_kernel_shared_reads_writes_unfused!(X, Y)
-    (; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X
-    (; y1, y2, y3, y4, y5, y6, y7, y8, y9, y10) = Y
-    # Totoal: 10 writes, 15 reads, and 5 read/write overlaps
-    @. y1 = x1 + x6
-    @. y2 = x2 + x7
-    @. y3 = x3 + x8
-    @. y4 = x4 + x9
-    @. y5 = x5 + x10
-    @. y6 = y1
-    @. y7 = y2
-    @. y8 = y3
-    @. y9 = y4
-    @. y10 = y5
+    (; x1, x2, x3, x4) = X
+    (; y1, y2, y3, y4) = Y
+    # Total: 4 writes, 8 reads (including redundants)
+    # Theoretical minimum: 4 + 4 read/writes
+    @. y1 = x1 + x3
+    @. y2 = x2 + x4
+    @. y3 = y1 + x4
+    @. y4 = y2 + y3
 end
 
 function perf_kernel_shared_reads_writes_fused!(X, Y)
-    (; x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) = X
-    (; y1, y2, y3, y4, y5, y6, y7, y8, y9, y10) = Y
+    (; x1, x2, x3, x4) = X
+    (; y1, y2, y3, y4) = Y
+    # Total: 4 writes, 8 reads (including redundants)
+    # Theoretical minimum: 4 + 4 read/writes
     MBF.@fused_direct begin
-        @. y1 = x1 + x6
-        @. y2 = x2 + x7
-        @. y3 = x3 + x8
-        @. y4 = x4 + x9
-        @. y5 = x5 + x10
-        @. y6 = y1
-        @. y7 = y2
-        @. y8 = y3
-        @. y9 = y4
-        @. y10 = y5
+        @. y1 = x1 + x3
+        @. y2 = x2 + x4
+        @. y3 = y1 + x4
+        @. y4 = y2 + y3
     end
 end
 
 @static get(ENV, "USE_CUDA", nothing) == "true" && using CUDA
 use_cuda = @isdefined(CUDA) && CUDA.has_cuda() # will be true if you first run `using CUDA`
 AType = use_cuda ? CUDA.CuArray : Array
 device_name = use_cuda ? CUDA.name(CUDA.device()) : "CPU"
-bm = Benchmark(;
-    problem_size = (prod((50, 5, 5, 6, 50)),),
-    device_name,
-    float_type = Float32,
-)
-problem_size = (50, 5, 5, 6, 50)
+bm = Benchmark(; device_name, float_type = Float32)
+problem_size = (50, 5, 5, 6, 5400)
 
 array_size = problem_size # array
 X = get_arrays(:x, AType, bm.float_type, array_size)
@@ -68,7 +55,7 @@ push_benchmark!(
     perf_kernel_shared_reads_writes_unfused!,
     X,
     Y;
-    n_reads_writes = 10 + 15,
+    n_reads_writes = 4 + 4,
     problem_size = array_size,
 )
 push_benchmark!(
@@ -77,7 +64,7 @@ push_benchmark!(
     perf_kernel_shared_reads_writes_fused!,
     X,
     Y;
-    n_reads_writes = 10 + 15,
+    n_reads_writes = 4 + 4,
     problem_size = array_size,
 )

@@ -97,7 +84,7 @@ push_benchmark!(
     perf_kernel_shared_reads_writes_unfused!,
     X,
     Y;
-    n_reads_writes = 10 + 15,
+    n_reads_writes = 4 + 4,
     problem_size = array_size,
 )
 push_benchmark!(
@@ -106,7 +93,7 @@ push_benchmark!(
     perf_kernel_shared_reads_writes_fused!,
     X,
     Y;
-    n_reads_writes = 10 + 15,
+    n_reads_writes = 4 + 4,
     problem_size = array_size,
 )

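
The 4 + 4 figure for this kernel matches the "Theoretical minimum" comment added above: in the fused version the intermediates y1, y2, y3 can stay in registers between statements, so only the four x arrays need to be read and the four y arrays written. A small counting sketch of that reasoning (the register reuse is an assumption about what the fused kernel can achieve, not something asserted by this diff):

# Memory-traffic count for the shared read/write kernel above.
unfused_ops = 4 + (2 + 2 + 2 + 2)   # 4 writes, plus reads for x1+x3, x2+x4, y1+x4, y2+y3
fused_ops = 4 + 4                   # 4 writes + 4 reads of x1..x4; y1, y2, y3 reused in registers
@show unfused_ops                   # 12 memory ops per element
@show fused_ops                     # 8 memory ops per element, the n_reads_writes used above
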
2 changes: 1 addition & 1 deletion test/execution/utils_setup.jl
@@ -1,6 +1,6 @@
 get_array(AType, FT, s) = AType(zeros(FT, s...))
 
-function get_arrays(sym, AType, FT, s, n = 10)
+function get_arrays(sym, AType, FT, s, n = 4)
     println("array_type = $AType")
     fn = ntuple(i -> Symbol(sym, i), n)
     return (; zip(fn, ntuple(_ -> get_array(AType, FT, s), n))...)
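
The default n = 4 here is what gives the benchmark kernels their four x and four y fields: get_arrays returns a NamedTuple of n identically shaped zero arrays. A self-contained usage sketch (definitions copied from utils_setup.jl; the toy size is illustrative):

get_array(AType, FT, s) = AType(zeros(FT, s...))
function get_arrays(sym, AType, FT, s, n = 4)
    println("array_type = $AType")
    fn = ntuple(i -> Symbol(sym, i), n)
    return (; zip(fn, ntuple(_ -> get_array(AType, FT, s), n))...)
end

X = get_arrays(:x, Array, Float32, (2, 3))
keys(X)      # (:x1, :x2, :x3, :x4), the new default n = 4
size(X.x1)   # (2, 3); each field is a zero-initialized Matrix{Float32}
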