Commit 1bc9396

[test] Add SG reduction e2e test cases. (#1121)
1 parent 7869696 commit 1bc9396

File tree

2 files changed: +168 -0 lines changed

File 1 of 2: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
// RUN: %python_executable %imex_runner --requires=mlir-levelzero-runtime,spirv-backend -i %s --pass-pipeline-file=%p/xegpu-to-llvm.pp \
// RUN: --runner mlir-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%mlir_levelzero_runtime --filecheck
module @gemm attributes {gpu.container_module} {
  gpu.module @kernel {
    gpu.func @reduce_broadcast(%in: memref<16x16xf16>, %c: memref<1x16xf16>) kernel attributes {intel_reqd_sub_group_size = 16 : i32} {
      %c0 = arith.constant 0 : index
      %c8 = arith.constant 8 : index
      %cst = arith.constant dense<0.0> : vector<16xf16>
      %in_tdesc = xegpu.create_nd_tdesc %in : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
      %c_tdesc = xegpu.create_nd_tdesc %c : memref<1x16xf16> -> !xegpu.tensor_desc<1x16xf16>
      %in_val = xegpu.load_nd %in_tdesc[%c0, %c0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
      %reduce = vector.multi_reduction <add>, %in_val, %cst [0] : vector<16x16xf16> to vector<16xf16>
      %reduce_1 = vector.shape_cast %reduce : vector<16xf16> to vector<1x16xf16>
      xegpu.store_nd %reduce_1, %c_tdesc[%c0, %c0] : vector<1x16xf16>, !xegpu.tensor_desc<1x16xf16>
      gpu.return
    }
  }

  func.func @test(%in: memref<16x16xf16>, %c: memref<1x16xf16>) -> memref<1x16xf16> attributes {llvm.emit_c_interface} {
    %c1 = arith.constant 1 : index
    %c16 = arith.constant 16 : index
    %c32 = arith.constant 32 : index
    %memref_in = gpu.alloc () : memref<16x16xf16>
    gpu.memcpy %memref_in, %in : memref<16x16xf16>, memref<16x16xf16>
    %memref_out = gpu.alloc () : memref<1x16xf16>
    gpu.memcpy %memref_out, %c : memref<1x16xf16>, memref<1x16xf16>
    gpu.launch_func @kernel::@reduce_broadcast blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%memref_in : memref<16x16xf16>, %memref_out : memref<1x16xf16>)
    gpu.wait
    gpu.memcpy %c, %memref_out : memref<1x16xf16>, memref<1x16xf16>
    gpu.dealloc %memref_in : memref<16x16xf16>
    gpu.dealloc %memref_out : memref<1x16xf16>
    return %c : memref<1x16xf16>
  }

  func.func @main() attributes {llvm.emit_c_interface} {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c16 = arith.constant 16 : index
    %in = memref.alloc() : memref<16x16xf16>
    %out = memref.alloc() : memref<1x16xf16>
    %out_host = memref.alloc() : memref<16xf32>
    // Fill input with random values.
    %in_cast = memref.cast %in : memref<16x16xf16> to memref<*xf16>
    %lower = arith.constant 0.0 : f32
    %upper = arith.constant 5.0 : f32
    %gen_int = arith.constant 1 : i1
    call @fillResource1DRandomF16(%in_cast, %lower, %upper, %gen_int) : (memref<*xf16>, f32, f32, i1) -> ()

    // CPU version.
    %c0_f16 = arith.constant 0.0 : f16
    %cst = arith.constant dense<1.0> : vector<16xf16>
    scf.for %i = %c0 to %c16 step %c1 {
      %col = vector.transfer_read %in[%c0, %i], %c0_f16 : memref<16x16xf16>, vector<16x1xf16>
      %col_16 = vector.shape_cast %col : vector<16x1xf16> to vector<16xf16>
      %reduce = vector.reduction <add>, %col_16 : vector<16xf16> into f16
      %reduce_f32 = arith.extf %reduce : f16 to f32
      memref.store %reduce_f32, %out_host[%i] : memref<16xf32>
    }
    %out_host_cast = memref.cast %out_host : memref<16xf32> to memref<*xf32>
    // GPU version.
    %gpu_out = call @test(%in, %out) : (memref<16x16xf16>, memref<1x16xf16>) -> memref<1x16xf16>
    %gpu_out_cast = memref.cast %gpu_out : memref<1x16xf16> to memref<*xf16>

    // call @printMemrefF16(%gpu_out_cast) : (memref<*xf16>) -> ()
    // call @printMemrefF32(%out_host_cast) : (memref<*xf32>) -> ()
    // CHECK: [ALLCLOSE: TRUE]
    call @printAllcloseF16(%gpu_out_cast, %out_host_cast) : (memref<*xf16>, memref<*xf32>) -> ()

    memref.dealloc %in : memref<16x16xf16>
    memref.dealloc %out : memref<1x16xf16>
    memref.dealloc %out_host : memref<16xf32>
    return
  }
  func.func private @printMemrefF16(memref<*xf16>) attributes {llvm.emit_c_interface}
  func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface}
  func.func private @printAllcloseF16(memref<*xf16>, memref<*xf32>) attributes {llvm.emit_c_interface}
  func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface}
  func.func private @fillResource1DRandomF16(memref<*xf16>, f32, f32, i1) attributes {llvm.emit_c_interface}
}
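Note on the first test: the kernel reduces the 16x16 tile along dim 0, so the stored 1x16 row holds per-column sums, out[0][j] = sum over i of in[i][j]. The core pattern, lifted from the kernel above:

    // Fold all 16 rows into one row of column sums:
    //   %reduce[j] = %cst[j] + sum over i of %in_val[i][j]
    %reduce = vector.multi_reduction <add>, %in_val, %cst [0] : vector<16x16xf16> to vector<16xf16>

The CPU reference in @main recomputes each column sum with vector.reduction, and printAllcloseF16 checks the GPU result against it.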
File 2 of 2: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
// RUN: %python_executable %imex_runner --requires=mlir-levelzero-runtime,spirv-backend -i %s --pass-pipeline-file=%p/xegpu-to-llvm.pp \
// RUN: --runner mlir-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%mlir_levelzero_runtime --filecheck
module @gemm attributes {gpu.container_module} {
  gpu.module @kernel {
    gpu.func @reduce_broadcast(%in: memref<16x16xf16>, %c: memref<16x16xf16>) kernel attributes {intel_reqd_sub_group_size = 16 : i32} {
      %c0 = arith.constant 0 : index
      %c8 = arith.constant 8 : index
      %cst = arith.constant dense<0.0> : vector<16xf16>
      %in_tdesc = xegpu.create_nd_tdesc %in : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
      %c_tdesc0 = xegpu.create_nd_tdesc %c : memref<16x16xf16> -> !xegpu.tensor_desc<8x16xf16>
      %c_tdesc1 = xegpu.create_nd_tdesc %c : memref<16x16xf16> -> !xegpu.tensor_desc<8x16xf16>
      %in_val = xegpu.load_nd %in_tdesc[%c0, %c0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
      %reduce = vector.multi_reduction <add>, %in_val, %cst [1] : vector<16x16xf16> to vector<16xf16>
      %reduce_1 = vector.shape_cast %reduce : vector<16xf16> to vector<16x1xf16>
      %broadcast = vector.broadcast %reduce_1 : vector<16x1xf16> to vector<16x16xf16>
      %broadcast0 = vector.extract_strided_slice %broadcast {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
      %broadcast1 = vector.extract_strided_slice %broadcast {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
      xegpu.store_nd %broadcast0, %c_tdesc0[%c0, %c0] : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
      xegpu.store_nd %broadcast1, %c_tdesc1[%c8, %c0] : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
      gpu.return
    }
  }

  func.func @test(%in: memref<16x16xf16>, %c: memref<16x16xf16>) -> memref<16x16xf16> attributes {llvm.emit_c_interface} {
    %c1 = arith.constant 1 : index
    %c16 = arith.constant 16 : index
    %c32 = arith.constant 32 : index
    %memref_in = gpu.alloc () : memref<16x16xf16>
    gpu.memcpy %memref_in, %in : memref<16x16xf16>, memref<16x16xf16>
    %memref_out = gpu.alloc () : memref<16x16xf16>
    gpu.memcpy %memref_out, %c : memref<16x16xf16>, memref<16x16xf16>
    gpu.launch_func @kernel::@reduce_broadcast blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%memref_in : memref<16x16xf16>, %memref_out : memref<16x16xf16>)
    gpu.wait
    gpu.memcpy %c, %memref_out : memref<16x16xf16>, memref<16x16xf16>
    gpu.dealloc %memref_in : memref<16x16xf16>
    gpu.dealloc %memref_out : memref<16x16xf16>
    return %c : memref<16x16xf16>
  }

  func.func @main() attributes {llvm.emit_c_interface} {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c16 = arith.constant 16 : index
    %in = memref.alloc() : memref<16x16xf16>
    %out = memref.alloc() : memref<16x16xf16>
    %out_host = memref.alloc() : memref<16x16xf32>
    // Fill input with random values.
    %in_cast = memref.cast %in : memref<16x16xf16> to memref<*xf16>
    %lower = arith.constant 0.0 : f32
    %upper = arith.constant 5.0 : f32
    %gen_int = arith.constant 1 : i1
    call @fillResource1DRandomF16(%in_cast, %lower, %upper, %gen_int) : (memref<*xf16>, f32, f32, i1) -> ()

    // CPU version.
    %c0_f16 = arith.constant 0.0 : f16
    %cst = arith.constant dense<1.0> : vector<16xf16>
    scf.for %i = %c0 to %c16 step %c1 {
      %row = vector.transfer_read %in[%i, %c0], %c0_f16 : memref<16x16xf16>, vector<16xf16>
      %reduce = vector.reduction <add>, %row : vector<16xf16> into f16
      %broadcast = vector.broadcast %reduce : f16 to vector<16xf16>
      %broadcast_f32 = arith.extf %broadcast : vector<16xf16> to vector<16xf32>
      vector.transfer_write %broadcast_f32, %out_host[%i, %c0] : vector<16xf32>, memref<16x16xf32>
    }
    %out_host_cast = memref.cast %out_host : memref<16x16xf32> to memref<*xf32>
    // GPU version.
    %gpu_out = call @test(%in, %out) : (memref<16x16xf16>, memref<16x16xf16>) -> memref<16x16xf16>
    %gpu_out_cast = memref.cast %gpu_out : memref<16x16xf16> to memref<*xf16>

    // call @printMemrefF16(%gpu_out_cast) : (memref<*xf16>) -> ()
    // call @printMemrefF32(%out_host_cast) : (memref<*xf32>) -> ()
    // CHECK: [ALLCLOSE: TRUE]
    call @printAllcloseF16(%gpu_out_cast, %out_host_cast) : (memref<*xf16>, memref<*xf32>) -> ()

    memref.dealloc %in : memref<16x16xf16>
    memref.dealloc %out : memref<16x16xf16>
    memref.dealloc %out_host : memref<16x16xf32>
    return
  }
  func.func private @printMemrefF16(memref<*xf16>) attributes {llvm.emit_c_interface}
  func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface}
  func.func private @printAllcloseF16(memref<*xf16>, memref<*xf32>) attributes {llvm.emit_c_interface}
  func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface}
  func.func private @fillResource1DRandomF16(memref<*xf16>, f32, f32, i1) attributes {llvm.emit_c_interface}
}
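Note on the second test: here the reduction runs along dim 1 (one sum per row), and each row sum is broadcast back across all 16 columns, so out[i][j] = sum over k of in[i][k] for every j; the result is written out as two 8x16 halves. The core pattern, lifted from the kernel above:

    // One sum per row, then stretch the unit dim back out to 16 columns:
    %reduce = vector.multi_reduction <add>, %in_val, %cst [1] : vector<16x16xf16> to vector<16xf16>
    %reduce_1 = vector.shape_cast %reduce : vector<16xf16> to vector<16x1xf16>
    %broadcast = vector.broadcast %reduce_1 : vector<16x1xf16> to vector<16x16xf16>

The CPU loop in @main mirrors this per row with vector.reduction followed by vector.broadcast, and the ALLCLOSE check compares the two results.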
