Check isIntOrFloat before querying bitwidth (#19172)

The element type must be an int or float before `getIntOrFloatBitWidth` can be called on it. This adds a check that the element type is an int or float; otherwise the innermost tile size is not adjusted. The issue was introduced by commit c80fa3b. This is 1/2 of the fix for #19167. --------- Signed-off-by: Ian Wood <[email protected]> Co-authored-by: giacs-epic <[email protected]>
iree-org · Nov 19, 2024 · b68c535 · b68c535
1 parent 540cebf
commit b68c535
Show file tree

Hide file tree

Showing 2 changed files with 57 additions and 4 deletions.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -2922,10 +2922,14 @@ setLoweringConfigForComputeOps(mlir::FunctionOpInterface entryPointFn,
   // loads and stores will have a performance impact.
   auto resultTypes = rootOperation->getResultTypes();
   if (commonVecTileSizes.size() != 0 && !resultTypes.empty()) {
-    auto elementTypeSize =
-        cast<ShapedType>(rootOperation->getResultTypes().front())
-            .getElementType()
-            .getIntOrFloatBitWidth();
+    Type elementType = cast<ShapedType>(resultTypes[0]).getElementType();
+    unsigned int elementTypeSize;
+    if (auto complexType = llvm::dyn_cast<ComplexType>(elementType)) {
+      elementTypeSize =
+          2 * complexType.getElementType().getIntOrFloatBitWidth();
+    } else {
+      elementTypeSize = elementType.getIntOrFloatBitWidth();
+    }
     // for now just enable for i1
     if (elementTypeSize == 1) {
       auto innermostTileSize = commonVecTileSizes.back();

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
@@ -1983,3 +1983,52 @@ func.func @i1_type()  attributes {hal.executable.target = #executable_target_emb
 // CHECK: func @i1_type()
 // CHECK: linalg.generic {
 // CHECK-SAME: {lowering_config = #[[CONFIG]]}
+
+// -----
+#pipeline_layout = #hal.pipeline.layout<bindings = [
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>
+]>
+
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
+#map = affine_map<(d0, d1, d2) -> (d1)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+func.func @complex_view_as_real() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1xi32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x50xcomplex<f32>>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1x32x50x2xf32>>
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<32x50x2xf32>>
+  %4 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1xi32>> -> tensor<1xi32>
+  %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0, 0], sizes = [1, 1, 32, 50, 2], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1x32x50x2xf32>> -> tensor<1x1x32x50x2xf32>
+  %6 = tensor.empty() : tensor<32x50x2xf32>
+  %extracted = tensor.extract %4[%c0] : tensor<1xi32>
+  %7 = arith.extsi %extracted : i32 to i64
+  %8 = arith.index_cast %7 : i64 to index
+  %9 = flow.dispatch.tensor.load %1, offsets = [%8, 0], sizes = [1, 50], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x50xcomplex<f32>>> -> tensor<50xcomplex<f32>>
+  %10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%9 : tensor<50xcomplex<f32>>) outs(%6 : tensor<32x50x2xf32>) {
+  ^bb0(%in: complex<f32>, %out: f32):
+    %11 = linalg.index 0 : index
+    %12 = linalg.index 1 : index
+    %extracted_0 = tensor.extract %5[%c0, %c0, %11, %12, %c0] : tensor<1x1x32x50x2xf32>
+    %extracted_1 = tensor.extract %5[%c0, %c0, %11, %12, %c1] : tensor<1x1x32x50x2xf32>
+    %13 = complex.create %extracted_0, %extracted_1 : complex<f32>
+    %14 = complex.mul %13, %in : complex<f32>
+    %15 = complex.re %14 : complex<f32>
+    %16 = complex.im %14 : complex<f32>
+    %17 = linalg.index 2 : index
+    %18 = arith.cmpi eq, %17, %c0 : index
+    %19 = arith.select %18, %15, %16 : f32
+    linalg.yield %19 : f32
+  } -> tensor<32x50x2xf32>
+  flow.dispatch.tensor.store %10, %3, offsets = [0, 0, 0], sizes = [32, 50, 2], strides = [1, 1, 1] : tensor<32x50x2xf32> -> !flow.dispatch.tensor<writeonly:tensor<32x50x2xf32>>
+  return
+}
+
+//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[4, 25, 2], [1, 1, 2], [0, 0, 0], [0, 0, 0]{{\]}}>
+//      CHECK: func.func @complex_view_as_real()
+//      CHECK:   linalg.generic
+// CHECK-SAME:       lowering_config = #[[CONFIG]]