From d27772e6f1abd07abb91e062f3225fb1fc58b957 Mon Sep 17 00:00:00 2001
From: Abhishek Varma
Date: Wed, 18 Sep 2024 18:48:22 +0530
Subject: [PATCH] [ObjectFifo] Create a pass to convert temporary alloc to amdaie.buffer (#783)

-- This commit creates a pass `iree-amdaie-temporary-alloc-bufferization` to convert temporary alloc/buffers to amdaie.buffer ops.

Signed-off-by: Abhishek Varma
---
 .../AMDAIETemporaryAllocBufferization.cpp     | 117 +++++++++++++++++++
 .../iree-amd-aie/Transforms/CMakeLists.txt    |   1 +
 .../iree-amd-aie/Transforms/PassDetail.h      |   1 +
 .../AMD-AIE/iree-amd-aie/Transforms/Passes.h  |   3 +++
 .../AMD-AIE/iree-amd-aie/Transforms/Passes.td |   6 ++
 .../Transforms/test/CMakeLists.txt            |   1 +
 .../test/temporary_alloc_bufferization.mlir   |  88 +++++++++++++++++
 7 files changed, 217 insertions(+)
 create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETemporaryAllocBufferization.cpp
 create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/temporary_alloc_bufferization.mlir

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETemporaryAllocBufferization.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETemporaryAllocBufferization.cpp
new file mode 100644
index 000000000..a664e6d57
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETemporaryAllocBufferization.cpp
@@ -0,0 +1,117 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree-amd-aie/IR/AMDAIEOps.h"
+#include "iree-amd-aie/Transforms/Passes.h"
+
+#define DEBUG_TYPE "iree-amdaie-temporary-alloc-bufferization"
+
+namespace mlir::iree_compiler::AMDAIE {
+
+namespace {
+
+/// Creates an `amdaie.buffer` op that stands in for `allocOp`.
+///
+/// The buffer takes the memref type of `allocOp` and the tile of `coreOp`, and
+/// is inserted right after the last `amdaie.tile` op in the body of
+/// `workgroupOp`. `index` is currently unused. The rewriter's insertion point
+/// is restored on return.
+static std::optional<BufferOp> createBufferForTemporaryAllocOp(
+    IRRewriter &rewriter, WorkgroupOp workgroupOp, memref::AllocOp allocOp,
+    CoreOp coreOp, unsigned index) {
+  OpBuilder::InsertionGuard g(rewriter);
+  TileOp tileOp = coreOp.getTileOp();
+  // Reset rewriter's location to after last tile's declaration.
+  auto tiles = workgroupOp.getBody()->getOps<TileOp>();
+  assert(!tiles.empty() && "no tiles in workgroupOp");
+  rewriter.setInsertionPointAfter(*std::prev(tiles.end(), 1));
+  auto bufferType = cast<MemRefType>(allocOp.getType());
+  // Reuse the alloc's location so the buffer keeps its source information.
+  auto bufferOp = rewriter.create<BufferOp>(allocOp.getLoc(), bufferType,
+                                            tileOp, nullptr);
+  return bufferOp;
+}
+
+/// Replaces every `memref.alloc` nested in `coreOp` with an `amdaie.buffer`.
+///
+/// All uses of each alloc are redirected to its new buffer. The alloc ops and
+/// any `memref.dealloc` ops found in `coreOp` are appended to `toBeErased`;
+/// they are not erased here. Returns failure if a buffer cannot be created.
+static LogicalResult bufferizeTemporaryAllocInCoreOp(
+    IRRewriter &rewriter, WorkgroupOp workgroupOp, CoreOp coreOp,
+    SmallVector<Operation *> &toBeErased) {
+  // Step 1. Get all buffers within a CoreOp.
+  SmallVector<memref::AllocOp> allocOps;
+  coreOp.walk([&](Operation *op) {
+    if (auto allocOp = dyn_cast<memref::AllocOp>(op)) {
+      allocOps.push_back(allocOp);
+      toBeErased.push_back(allocOp);
+    } else if (auto deallocOp = dyn_cast<memref::DeallocOp>(op)) {
+      toBeErased.push_back(deallocOp);
+    }
+  });
+  // Bail out early in case of no temporary buffers.
+  if (allocOps.empty()) return success();
+  // Step 2. Traverse unique allocOps and create an aie.buffer for them.
+  unsigned tempBufferIndex = 0;
+  for (memref::AllocOp allocOp : allocOps) {
+    std::optional<BufferOp> temporaryBuffer = createBufferForTemporaryAllocOp(
+        rewriter, workgroupOp, allocOp, coreOp, tempBufferIndex++);
+    if (!temporaryBuffer) {
+      return failure();
+    }
+    allocOp.replaceAllUsesWith(temporaryBuffer.value().getResult());
+  }
+  return success();
+}
+
+/// Pass that converts `memref.alloc` ops inside `amdaie.core` ops into
+/// `amdaie.buffer` ops and erases the original alloc/dealloc ops.
+class AMDAIETemporaryAllocBufferizationPass
+    : public impl::AMDAIETemporaryAllocBufferizationBase<
+          AMDAIETemporaryAllocBufferizationPass> {
+ public:
+  void getDependentDialects(DialectRegistry &registry) const override {
+    // NOTE(review): dialect list assumed to be AMDAIE only -- confirm.
+    registry.insert<AMDAIEDialect>();
+  }
+
+  void runOnOperation() override;
+};
+
+/// Bufferizes the temporary allocs of each core in each workgroup, then
+/// erases the collected alloc/dealloc ops. Signals pass failure, without
+/// erasing anything, if bufferization of any core fails.
+void AMDAIETemporaryAllocBufferizationPass::runOnOperation() {
+  Operation *parentOp = getOperation();
+  IRRewriter rewriter(&getContext());
+
+  SmallVector<Operation *> toBeErased;
+  WalkResult res = parentOp->walk([&](WorkgroupOp workgroupOp) {
+    for (CoreOp coreOp : workgroupOp.getOps<CoreOp>()) {
+      if (failed(bufferizeTemporaryAllocInCoreOp(rewriter, workgroupOp, coreOp,
+                                                 toBeErased)))
+        return WalkResult::interrupt();
+    }
+    return WalkResult::advance();
+  });
+  if (res.wasInterrupted()) return signalPassFailure();
+
+  // Drop any remaining uses of each collected op before erasing it.
+  for (Operation *op : toBeErased) {
+    op->dropAllUses();
+    rewriter.eraseOp(op);
+  }
+}
+
+}  // namespace
+
+/// Creates an instance of the temporary alloc bufferization pass.
+std::unique_ptr<Pass> createAMDAIETemporaryAllocBufferizationPass() {
+  return std::make_unique<AMDAIETemporaryAllocBufferizationPass>();
+}
+
+}  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
index 4cdc7b48c..002a9bcec 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
@@ -92,6 +92,7 @@ iree_cc_library(
     "AMDAIERemoveMemorySpace.cpp"
     "AMDAIESinkIntoCore.cpp"
     "AMDAIESplitLogicalObjFifosForConnectionReuse.cpp"
+    "AMDAIETemporaryAllocBufferization.cpp"
     "AMDAIETile.cpp"
     "AMDAIETileAndFuse.cpp"
     "AMDAIEUtils.cpp"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
index 903e51229..1e0ba9bfa 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
@@ -71,6 +71,7 @@ namespace mlir::iree_compiler::AMDAIE {
 #define GEN_PASS_DEF_AMDAIEREMOVEMEMORYSPACE
 #define GEN_PASS_DEF_AMDAIESINKINTOCORE
 #define GEN_PASS_DEF_AMDAIESPLITLOGICALOBJFIFOSFORCONNECTIONREUSE
+#define GEN_PASS_DEF_AMDAIETEMPORARYALLOCBUFFERIZATION
 #define GEN_PASS_DEF_AMDAIETILE
 #define GEN_PASS_DEF_AMDAIETILEANDFUSE
 #define GEN_PASS_DEF_AMDAIEVECTORIZATION
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
index a5e796c2c..c3867d009 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
@@ -233,6 +233,9 @@ std::unique_ptr<Pass> createAMDAIESinkIntoCorePass();
 /// Create a pass to split logicalobjectfifos for connection reuse.
 std::unique_ptr<Pass> createAMDAIESplitLogicalObjFifosForConnectionReusePass();
 
+/// Create a pass to bufferize temporary alloc ops.
+std::unique_ptr<Pass> createAMDAIETemporaryAllocBufferizationPass();
+
 /// Create pass to tile TilingInterface operations.
 std::unique_ptr<Pass> createAMDAIETilePass(AMDAIETileOptions options = {});
 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
index 10704eb42..339ed9651 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
@@ -506,6 +506,12 @@ def AMDAIESplitLogicalObjFifosForConnectionReuse :
   let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIESplitLogicalObjFifosForConnectionReusePass()";
 }
 
+def AMDAIETemporaryAllocBufferization :
+    Pass<"iree-amdaie-temporary-alloc-bufferization", ""> {
+  let summary = "Bufferizes temporary alloc buffers into `amdaie.buffer` ops.";
+  let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIETemporaryAllocBufferizationPass()";
+}
+
 def AMDAIETile :
     InterfacePass<"iree-amdaie-tile", "mlir::FunctionOpInterface"> {
   let summary = "Pass to tile TilingInterface operations.";
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
index fbb2ae8d6..61071df29 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
@@ -64,6 +64,7 @@ iree_lit_test_suite(
     "remove_memory_space.mlir"
     "sink_into_core.mlir"
     "split_logicalobjfifos_for_connection_reuse.mlir"
+    "temporary_alloc_bufferization.mlir"
     "tile_and_fuse_using_scf_for.mlir"
     "tile_and_fuse_matmul_using_scf_forall.mlir"
     "tile_and_fuse_convolution_using_scf_forall.mlir"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/temporary_alloc_bufferization.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/temporary_alloc_bufferization.mlir
new file mode 100644
index 000000000..8743c00f0
--- /dev/null
+++
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/temporary_alloc_bufferization.mlir
@@ -0,0 +1,88 @@
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(iree-amdaie-temporary-alloc-bufferization)" --verify-diagnostics %s | FileCheck %s
+
+// CHECK-LABEL: @temp_buffer
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
+// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]])
+// CHECK-DAG: %[[TILE_0_3:.*]] = amdaie.tile(%[[C0]], %[[C3]])
+// CHECK-DAG: %[[TILE_1_2:.*]] = amdaie.tile(%[[C1]], %[[C2]])
+// CHECK-DAG: %[[BUFFER_1_2_0:.*]] = amdaie.buffer(%[[TILE_1_2]]) : memref<1024xf32, 2 : i32>
+// CHECK-DAG: %[[BUFFER_1_2_1:.*]] = amdaie.buffer(%[[TILE_1_2]]) : memref<1024xf32, 2 : i32>
+// CHECK-DAG: %[[BUFFER_0_3_0:.*]] = amdaie.buffer(%[[TILE_0_3]]) : memref<1024xf32, 2 : i32>
+// CHECK-DAG: %[[BUFFER_0_3_1:.*]] = amdaie.buffer(%[[TILE_0_3]]) : memref<1024xf32, 2 : i32>
+// CHECK-DAG: %[[BUFFER_0_2:.*]] = amdaie.buffer(%[[TILE_0_2]]) : memref<1024xf32, 2 : i32>
+// CHECK: amdaie.core(%[[TILE_0_2]]
+// CHECK-NOT: memref.alloc
+// CHECK: %[[CAST:.*]] = memref.reinterpret_cast %[[BUFFER_0_2]]
+// CHECK: linalg.fill ins(%{{.*}}) outs(%[[CAST]]
+// CHECK-NOT: dealloc
+// CHECK: amdaie.end
+// CHECK: amdaie.core(%[[TILE_0_3]]
+// CHECK-NOT: memref.alloc
+// CHECK: %[[CAST:.*]] = memref.reinterpret_cast %[[BUFFER_0_3_1]]
+// CHECK: linalg.fill ins(%{{.*}}) outs(%[[CAST]]
+// CHECK-NOT: dealloc
+// CHECK-NOT: memref.alloc
+// CHECK: %[[CAST_1:.*]] = memref.reinterpret_cast %[[BUFFER_0_3_0]]
+// CHECK: linalg.fill ins(%{{.*}}) outs(%[[CAST_1]]
+// CHECK-NOT: dealloc
+// CHECK: amdaie.end
+// CHECK: amdaie.core(%[[TILE_1_2]]
+// CHECK-NOT: memref.alloc
+// CHECK-NOT: memref.alloc
+// CHECK: %[[CAST:.*]] = memref.reinterpret_cast %[[BUFFER_1_2_1]]
+// CHECK: %[[CAST_1:.*]] = memref.reinterpret_cast %[[BUFFER_1_2_0]]
+// CHECK: linalg.fill ins(%{{.*}}) outs(%[[CAST]]
+// CHECK: linalg.fill ins(%{{.*}}) outs(%[[CAST_1]]
+// CHECK-NOT: dealloc
+// CHECK-NOT: dealloc
+// CHECK: amdaie.end
+func.func @temp_buffer() {
+  amdaie.workgroup {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c2 = arith.constant 2 : index
+    %c3 = arith.constant 3 : index
+    %tile_0_2 = amdaie.tile(%c0, %c2)
+    %tile_0_3 = amdaie.tile(%c0, %c3)
+    %tile_1_2 = amdaie.tile(%c1, %c2)
+    %core_0_2 = amdaie.core(%tile_0_2, in : [], out : []) {  // Single alloc/dealloc pair.
+      %cst_0 = arith.constant 0.000000e+00 : f32
+      %alloc = memref.alloc() : memref<1024xf32, 2 : i32>
+      %reinterpret_cast = memref.reinterpret_cast %alloc to offset: [0], sizes: [1, 1, 8, 8, 4, 4], strides: [1024, 1024, 128, 16, 4, 1] : memref<1024xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, 2 : i32>
+      linalg.fill ins(%cst_0 : f32) outs(%reinterpret_cast : memref<1x1x8x8x4x4xf32, 2 : i32>)
+      memref.dealloc %alloc : memref<1024xf32, 2 : i32>
+      amdaie.end
+    }
+    %core_0_3 = amdaie.core(%tile_0_3, in : [], out : []) {  // Two sequential alloc/dealloc pairs.
+      %cst_0 = arith.constant 0.000000e+00 : f32
+      %alloc = memref.alloc() : memref<1024xf32, 2 : i32>
+      %reinterpret_cast = memref.reinterpret_cast %alloc to offset: [0], sizes: [1, 1, 8, 8, 4, 4], strides: [1024, 1024, 128, 16, 4, 1] : memref<1024xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, 2 : i32>
+      linalg.fill ins(%cst_0 : f32) outs(%reinterpret_cast : memref<1x1x8x8x4x4xf32, 2 : i32>)
+      memref.dealloc %alloc : memref<1024xf32, 2 : i32>
+      %alloc_1 = memref.alloc() : memref<1024xf32, 2 : i32>
+      %reinterpret_cast_1 = memref.reinterpret_cast %alloc_1 to offset: [0], sizes: [1, 1, 8, 8, 4, 4], strides: [1024, 1024, 128, 16, 4, 1] : memref<1024xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, 2 : i32>
+      linalg.fill ins(%cst_0 : f32) outs(%reinterpret_cast_1 : memref<1x1x8x8x4x4xf32, 2 : i32>)
+      memref.dealloc %alloc_1 : memref<1024xf32, 2 : i32>
+      amdaie.end
+    }
+    %core_1_2 = amdaie.core(%tile_1_2, in : [], out : []) {  // Two allocs, both deallocated at the end.
+      %cst_0 = arith.constant 0.000000e+00 : f32
+      %alloc = memref.alloc() : memref<1024xf32, 2 : i32>
+      %alloc_1 = memref.alloc() : memref<1024xf32, 2 : i32>
+      %reinterpret_cast = memref.reinterpret_cast %alloc to offset: [0], sizes: [1, 1, 8, 8, 4, 4], strides: [1024, 1024, 128, 16, 4, 1] : memref<1024xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, 2 : i32>
+      %reinterpret_cast_1 = memref.reinterpret_cast %alloc_1 to offset: [0], sizes: [1, 1, 8, 8, 4, 4], strides: [1024, 1024, 128, 16, 4, 1] : memref<1024xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, 2 : i32>
+      linalg.fill ins(%cst_0 : f32) outs(%reinterpret_cast : memref<1x1x8x8x4x4xf32, 2 : i32>)
+      linalg.fill ins(%cst_0 : f32) outs(%reinterpret_cast_1 : memref<1x1x8x8x4x4xf32, 2 : i32>)
+      memref.dealloc %alloc : memref<1024xf32, 2 : i32>
+      memref.dealloc %alloc_1 : memref<1024xf32, 2 : i32>
+      amdaie.end
+    }
+    amdaie.controlcode {
+      amdaie.end
+    }
+  }
+  return
+}