squash

nod-ai · Aug 28, 2024 · ade1af2 · ade1af2
1 parent a5fbf9d
commit ade1af2
Show file tree

Hide file tree

Showing 2 changed files with 109 additions and 16 deletions.
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp
@@ -6,6 +6,7 @@
 
 #include "iree-amd-aie/Transforms/Passes.h"
 #include "iree/compiler/Codegen/Utils/Utils.h"
+#include "llvm/ADT/StringExtras.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
@@ -16,16 +17,97 @@
 #include "mlir/Dialect/SCF/Transforms/TileUsingInterface.h"
 #include "mlir/Dialect/SCF/Transforms/Transforms.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
 #include "mlir/IR/Iterators.h"
 #include "mlir/IR/PatternMatch.h"
 
 #define DEBUG_TYPE "iree-amdaie-tile-and-fuse"
 
-
 namespace mlir::iree_compiler::AMDAIE {
 
 namespace {
 
+enum class GPUMappingType { Block, Thread };
+FailureOr<SmallVector<Attribute>> getGPUMappingAttributes(
+    ArrayRef<int64_t> tileSizesVal, GPUMappingType mappingType,
+    TilingInterface op) {
+  MLIRContext *context = op.getContext();
+
+  // There is one induction variable in the scf.forall for each of the
+  // non-zero tile sizes. Recall that a '0' tile size corresponds to 'do
+  // not tile'.
+  uint32_t nbIndVars = std::count_if(tileSizesVal.begin(), tileSizesVal.end(),
+                                     [](int64_t t) { return t != 0; });
+
+  // See mlir::gpu::MappingId enum. Currently 13 dimensions are supported.
+  // clang-format off
+  // https://github.com/llvm/llvm-project/blob/e8063702cfbbf39f0b92283d0588dee264b5eb2b/mlir/include/mlir/Dialect/GPU/IR/GPUDeviceMappingAttr.td#L37.
+  // clang-format on
+  if (nbIndVars > mlir::gpu::getMaxEnumValForMappingId()) {
+    return op->emitOpError("has too many dimensions to tile, ")
+           << "there are only " << mlir::gpu::getMaxEnumValForMappingId()
+           << " dimensions available in the mlir::gpu dialect (require "
+           << nbIndVars << " here).";
+  }
+
+
+  auto getMappingAttributeForDimension = [&](uint32_t i) -> Attribute {
+    auto id = static_cast<gpu::MappingId>(i);
+    if (mappingType == GPUMappingType::Block)
+      return gpu::GPUBlockMappingAttr::get(context, id);
+    // GPUMappingType only has Block and Thread. Assert on the remaining case
+    // and return unconditionally, so that the lambda never flows off its end
+    // (undefined behaviour) when asserts are compiled out.
+    assert(mappingType == GPUMappingType::Thread && "unhandled mapping type");
+    return gpu::GPUThreadMappingAttr::get(context, id);
+  };
+
+  SmallVector<Attribute> mapping;
+  mapping.reserve(nbIndVars);
+  for (uint32_t i = 0; i < nbIndVars; ++i) {
+    // DimY and DimX are swapped, so that DimX goes to AIE array columns and
+    // DimY goes to AIE array rows (or something like that...)
+    if (i == 0)
+      mapping.push_back(getMappingAttributeForDimension(1));
+    else if (i == 1)
+      mapping.push_back(getMappingAttributeForDimension(0));
+    else
+      mapping.push_back(getMappingAttributeForDimension(i));
+  }
+
+  // Currently we expect only the first 2 tiled dimensions to be more than 1.
+  // TODO(newling) if there are 3+ dimensions, we need to collapse them in
+  // a follow-up pass. Instead of
+  //   (i,j,k) in (2,3,5)
+  // for example, could be
+  //   (i,l) in (2,15)
+  // with then
+  //   j=l/5 and k=l%5.
+  // Once the above is implemented, we can safely remove the following check.
+  //
+  // Example of failure (0,2,3,1,4) : the 4 is not one of the first 2 non-zeros
+  // Example of failure (0,1,1,2)   : the 2 is not one of the first 2 non-zeros
+  // Example of success (0,2,3,1,1) : there are no entries above 1 after 3.
+  if (nbIndVars > 2) {
+    uint32_t indVarCount = 0;
+    for (auto tileSize : tileSizesVal) {
+      if (indVarCount >= 2 && tileSize > 1) {
+        return op->emitOpError("has requested tile sizes (")
+               << llvm::join_items(tileSizesVal, ",")
+               << "). Currently we only support tiling where the first 2 "
+                  "non-zero tile sizes are greater than 1. We must implement "
+                  "a pass to merge tiling dimensions to be able to lower to "
+                  "the 2-D AIE array. ";
+      }
+      if (tileSize > 0) {
+        ++indVarCount;
+      }
+    }
+  }
+
+  return mapping;
+}
+
 /// Utility function to check if any of the reduction dimension is being tiled.
 static bool isTilingReductionDimension(TilingInterface consumerOp,
                                        SmallVector<int64_t> tileSizesVal) {
@@ -157,27 +239,20 @@ void AMDAIETileAndFusePass::runOnOperation() {
 
   SmallVector<OpFoldResult> tileSizes =
       getAsIndexOpFoldResult(context, tileSizesVal);
+
   auto options = scf::SCFTilingOptions().setTileSizes(tileSizes);
 
   // When tiling using scf.for we do not need to set any mapping.
   if (!useSCFFor) {
     options.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);
-    // Here we assume there are always two levels of parallel (scf.forall)
-    // loops, and the first level of tiling is always using scf.forall and
-    // mapped to blocks. Currently we are not using mapping attributes for
-    // Conv2d ops, because there could be four parallel tiling dimensions.
-    // TODO (vivian): create AIE specific mapping attributes.
-    if (!isa<linalg::ConvolutionOpInterface>(consumerOp.getOperation())) {
-      if (tilingLevel == 0) {
-        options.setMapping(
-            {gpu::GPUBlockMappingAttr::get(context, gpu::MappingId::DimY),
-             gpu::GPUBlockMappingAttr::get(context, gpu::MappingId::DimX)});
-      } else {
-        options.setMapping(
-            {gpu::GPUThreadMappingAttr::get(context, gpu::MappingId::DimY),
-             gpu::GPUThreadMappingAttr::get(context, gpu::MappingId::DimX)});
-      }
+    auto maybeMapping = getGPUMappingAttributes(
+        tileSizesVal,
+        tilingLevel == 0 ? GPUMappingType::Block : GPUMappingType::Thread,
+        consumerOp);
+    if (failed(maybeMapping)) {
+      return signalPassFailure();
     }
+    options.setMapping(maybeMapping.value());
   }
 
   IRRewriter rewriter(context);

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
@@ -634,6 +634,24 @@ void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device) {
   passManager.addPass(createEraseHALDescriptorTypeFromMemRefPass());
   passManager.addPass(memref::createFoldMemRefAliasOpsPass());
   passManager.addPass(createAMDAIEBridgeToAIRPass());
+
+  // Running canonicalization for all pipelines here results in failures.
+  // Example
+  // ```
+  // 'memref.cast' op is an unsupported operation. This pass currently only
+  // supports AllocOp and SubViewOp as inputs.
+  // ```
+  // It is currently required for the convolution pipeline though, to remove the
+  // extra (size-1) thread- and group- dimensions.
+  //
+  // TODO(newling) there are better solutions like:
+  // 1) make canonicalization work for scf.forall
+  // 2) pass to collapse rank-4 scf.foralls to rank-2 scf.foralls.
+  // 3) resolve above 'unsupported operation' error.
+  if (clUseTilePipeline == TilePassPipeline::ConvDecomposePipeline) {
+    passManager.addPass(createCanonicalizerPass());
+  }
+
   // TODO (Erwei): Figure out a way to work with AMDAIEPackToDmaPass.
   if (clUseTilePipeline == TilePassPipeline::PackPeelPipeline)
     passManager.addPass(createAMDAIEDecomposeLinalgExtPackUnPackToAIRPass());