// RUN: %python_executable %imex_runner --requires=mlir-levelzero-runtime,spirv-backend -i %s --pass-pipeline-file=%p/xegpu-to-llvm.pp \
// RUN: --runner mlir-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%mlir_levelzero_runtime --filecheck
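// Runs the reduce+broadcast kernel below through the xegpu-to-llvm pass pipeline,
// executes it on the Level Zero runtime, and compares the result against a CPU
// reference via the allclose check at the end of @main.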
module @gemm attributes {gpu.container_module} {
  gpu.module @kernel {
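    // Loads the 16x16 input tile, reduces it along dim 1 (one sum per row),
    // broadcasts each row sum across its row, and stores the 16x16 result
    // back as two 8x16 tiles.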
    gpu.func @reduce_broadcast(%in: memref<16x16xf16>, %c: memref<16x16xf16>) kernel attributes {intel_reqd_sub_group_size = 16 : i32} {
      %c0 = arith.constant 0 : index
      %c8 = arith.constant 8 : index
      %cst = arith.constant dense<0.0> : vector<16xf16>
      %in_tdesc = xegpu.create_nd_tdesc %in : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
      %c_tdesc0 = xegpu.create_nd_tdesc %c : memref<16x16xf16> -> !xegpu.tensor_desc<8x16xf16>
      %c_tdesc1 = xegpu.create_nd_tdesc %c : memref<16x16xf16> -> !xegpu.tensor_desc<8x16xf16>
      %in_val = xegpu.load_nd %in_tdesc[%c0, %c0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
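      // Reduce along dim 1 to get one sum per row, then broadcast each row sum
      // across its row: <16x16> -> <16> -> <16x1> -> <16x16>.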
      %reduce = vector.multi_reduction <add>, %in_val, %cst [1] : vector<16x16xf16> to vector<16xf16>
      %reduce_1 = vector.shape_cast %reduce : vector<16xf16> to vector<16x1xf16>
      %broadcast = vector.broadcast %reduce_1 : vector<16x1xf16> to vector<16x16xf16>
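      // Split the 16x16 result into two 8x16 tiles and store them at row offsets 0 and 8.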
      %broadcast0 = vector.extract_strided_slice %broadcast {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
      %broadcast1 = vector.extract_strided_slice %broadcast {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
      xegpu.store_nd %broadcast0, %c_tdesc0[%c0, %c0] : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
      xegpu.store_nd %broadcast1, %c_tdesc1[%c8, %c0] : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
      gpu.return
    }
  }

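  // Host wrapper: copies the input and output buffers to device memory, launches the
  // kernel with a single work-group of 16 work-items, and copies the result back.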
  func.func @test(%in: memref<16x16xf16>, %c: memref<16x16xf16>) -> memref<16x16xf16> attributes {llvm.emit_c_interface} {
    %c1 = arith.constant 1 : index
    %c16 = arith.constant 16 : index
    %memref_in = gpu.alloc () : memref<16x16xf16>
    gpu.memcpy %memref_in, %in : memref<16x16xf16>, memref<16x16xf16>
    %memref_out = gpu.alloc () : memref<16x16xf16>
    gpu.memcpy %memref_out, %c : memref<16x16xf16>, memref<16x16xf16>
    gpu.launch_func @kernel::@reduce_broadcast blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%memref_in : memref<16x16xf16>, %memref_out : memref<16x16xf16>)
    gpu.wait
    gpu.memcpy %c, %memref_out : memref<16x16xf16>, memref<16x16xf16>
    gpu.dealloc %memref_in : memref<16x16xf16>
    gpu.dealloc %memref_out : memref<16x16xf16>
    return %c : memref<16x16xf16>
  }

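  // Entry point: fills the input with random values, computes the expected result
  // on the CPU, runs the GPU kernel via @test, and compares the two with allclose.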
  func.func @main() attributes {llvm.emit_c_interface} {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c16 = arith.constant 16 : index
    %in = memref.alloc() : memref<16x16xf16>
    %out = memref.alloc() : memref<16x16xf16>
    %out_host = memref.alloc() : memref<16x16xf32>
    // Fill input with random values
    %in_cast = memref.cast %in : memref<16x16xf16> to memref<*xf16>
    %lower = arith.constant 0.0 : f32
    %upper = arith.constant 5.0 : f32
    %gen_int = arith.constant 1 : i1
    call @fillResource1DRandomF16(%in_cast, %lower, %upper, %gen_int) : (memref<*xf16>, f32, f32, i1) -> ()

    // CPU reference: per-row sums broadcast across each row, stored as f32.
    %c0_f16 = arith.constant 0.0 : f16
    scf.for %i = %c0 to %c16 step %c1 {
      %row = vector.transfer_read %in[%i, %c0], %c0_f16 : memref<16x16xf16>, vector<16xf16>
      %reduce = vector.reduction <add>, %row : vector<16xf16> into f16
      %broadcast = vector.broadcast %reduce : f16 to vector<16xf16>
      %broadcast_f32 = arith.extf %broadcast : vector<16xf16> to vector<16xf32>
      vector.transfer_write %broadcast_f32, %out_host[%i, %c0] : vector<16xf32>, memref<16x16xf32>
    }
    %out_host_cast = memref.cast %out_host : memref<16x16xf32> to memref<*xf32>
    // GPU version.
    %gpu_out = call @test(%in, %out) : (memref<16x16xf16>, memref<16x16xf16>) -> memref<16x16xf16>
    %gpu_out_cast = memref.cast %gpu_out : memref<16x16xf16> to memref<*xf16>

    // call @printMemrefF16(%gpu_out_cast) : (memref<*xf16>) -> ()
    // call @printMemrefF32(%out_host_cast) : (memref<*xf32>) -> ()
    // CHECK: [ALLCLOSE: TRUE]
    call @printAllcloseF16(%gpu_out_cast, %out_host_cast) : (memref<*xf16>, memref<*xf32>) -> ()

    memref.dealloc %in : memref<16x16xf16>
    memref.dealloc %out : memref<16x16xf16>
    memref.dealloc %out_host : memref<16x16xf32>
    return
  }
  func.func private @printMemrefF16(memref<*xf16>) attributes {llvm.emit_c_interface}
  func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface}
  func.func private @printAllcloseF16(memref<*xf16>, memref<*xf32>) attributes {llvm.emit_c_interface}
  func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface}
  func.func private @fillResource1DRandomF16(memref<*xf16>, f32, f32, i1) attributes {llvm.emit_c_interface}
}