Assign correct tiles to reusable L1 buffer
Abhishek-Varma committed Sep 11, 2024
1 parent d30ec7f commit 4f0caea
Showing 2 changed files with 71 additions and 36 deletions.
@@ -834,18 +834,18 @@ LogicalResult combineLogicalObjectFifos(
// will make an attempt to combine the logical objectFifos as per the
// following algorithm :-
// a. Combine i-th and i+1-th L3->L2 DmaCpyNd ops.
// b. Since step a would create a new L2 buffer (with combined shape), we
// will
// need to update the corresponding two L2->L1 Dma ops by indeed creating
// new ones. NOTE: Both of these new L2->L1 Dma ops will be reusing the
// same L1 buffers as well.
// c. Now pick the unique core ops corresponding to i-th and i+1-th L2->L1
// Dma
// ops and do the following :-
// b. Form reusable L1 buffer by assigning the cumulative tiles of the
// intended core ops.
// c. Since step a would create a new L2 buffer (with combined shape), we
// will need to update the corresponding two L2->L1 Dma ops by indeed
// creating new ones. NOTE: Both of these new L2->L1 Dma ops will be
// reusing the same L1 buffers as well.
// d. Now pick the unique core ops corresponding to i-th and i+1-th L2->L1
// Dma ops and do the following :-
// 1. For i-th CoreOp insert an AccessOp from the same L1 buffer towards
// the end.
// 2. For i+1-th CoreOp insert an AccessOp from the same L1 buffer right
// before the corresponding AccessOp within the same CoreOp.
for (unsigned i = 0, n = l3ToL2DmaOps.size(); i < n; i += 2) {
// Step 1. Combine the picked L3->L2 DmaCpyNd pair.
FailureOr<LogicalObjectFifoFromMemrefOp> maybeNewL2ObjectFifo =
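// Standalone sketch of step b above, separate from the code in this commit:
// the pass collects the (column, row) locations of the two core ops that will
// share one L1 buffer and sorts them with
// AMDAIE::TileOp::tileValueColumnAndRowComparator before rebuilding the L1
// logicalobjectfifo with that tile list. Plain (col, row) pairs stand in for
// TileOp values here, and the column-then-row ordering is an assumption based
// on the comparator's name.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

int main() {
  using Tile = std::pair<int64_t, int64_t>;  // (column, row)
  Tile firstCoreTile{0, 3};                  // hypothetical core locations
  Tile secondCoreTile{0, 2};

  // Cumulative tiles of the two core ops that reuse the same L1 buffer.
  std::vector<Tile> tiles{firstCoreTile, secondCoreTile};
  std::sort(tiles.begin(), tiles.end(), [](const Tile &a, const Tile &b) {
    return a.first != b.first ? a.first < b.first : a.second < b.second;
  });

  // The reusable L1 logicalobjectfifo is then recreated with both tiles.
  for (const Tile &t : tiles)
    std::cout << "tile(col=" << t.first << ", row=" << t.second << ")\n";
  return 0;
}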
@@ -855,14 +855,56 @@ LogicalResult combineLogicalObjectFifos(
LogicalObjectFifoFromMemrefOp newL2ObjectFifo =
maybeNewL2ObjectFifo.value();

// Step 2. We now have need to create two L2->L1 ops since the size has
// Step 2. Form the reusable L1 buffer by assigning the cumulative tiles of
// the intended core ops.
LogicalObjectFifoFromMemrefOp reuseL1LogicalObjectFifoOp =
l2ToL1DmaOps[i].getTargetObjectFifo();
SmallVector<Value> tiles;
auto addNewTileFrom = [&](CoreOp coreOp) -> LogicalResult {
OpBuilder::InsertionGuard guard(rewriter);
TileOp tileOp = coreOp.getTileOp();
std::optional<int64_t> column = getConstantIntValue(tileOp.getCol());
std::optional<int64_t> row = getConstantIntValue(tileOp.getRow());
if (!column || !row) {
return coreOp.emitOpError() << "has non-constant tile location";
}
rewriter.setInsertionPoint(reuseL1LogicalObjectFifoOp);
auto colIndex = rewriter.create<arith::ConstantIndexOp>(
rewriter.getUnknownLoc(), *column);
auto rowIndex = rewriter.create<arith::ConstantIndexOp>(
rewriter.getUnknownLoc(), *row);
tileOp =
rewriter.create<TileOp>(rewriter.getUnknownLoc(), colIndex, rowIndex);
tiles.push_back(tileOp.getResult());
return success();
};
std::optional<CoreOp> maybeFirstCoreOp = fetchUniqueCoreOp(l2ToL1DmaOps[i]);
if (!maybeFirstCoreOp) return failure();
CoreOp firstCoreOp = maybeFirstCoreOp.value();
std::optional<CoreOp> maybeSecondCoreOp =
fetchUniqueCoreOp(l2ToL1DmaOps[i + 1]);
if (!maybeSecondCoreOp) return failure();
CoreOp secondCoreOp = maybeSecondCoreOp.value();
if (failed(addNewTileFrom(firstCoreOp)) ||
failed(addNewTileFrom(secondCoreOp))) {
return failure();
}
llvm::sort(tiles.begin(), tiles.end(),
AMDAIE::TileOp::tileValueColumnAndRowComparator);
rewriter.setInsertionPoint(reuseL1LogicalObjectFifoOp);
reuseL1LogicalObjectFifoOp =
rewriter.replaceOpWithNewOp<LogicalObjectFifoFromMemrefOp>(
reuseL1LogicalObjectFifoOp,
cast<LogicalObjectFifoType>(
reuseL1LogicalObjectFifoOp.getOutput().getType()),
reuseL1LogicalObjectFifoOp.getMemref(), tiles);

// Step 3. We now have need to create two L2->L1 ops since the size has
// changed. But for this we first need to find the new offset for L2 as
// source.
// TODO: For now I'm hardcoding the offsets but later it'd just depend
// on combining/non-combining dimensions.
// Offset = 0,0
LogicalObjectFifoFromMemrefOp reuseL1LogicalObjectFifoOp =
l2ToL1DmaOps[i].getTargetObjectFifo();
SmallVector<OpFoldResult> newL2AsSourceOffsets =
l2ToL1DmaOps[i].getSourceMixedOffsets();
DmaCpyNdOp newFirstL2ToL1DmaOp = createL2ToL1ForReuse(
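// Standalone sketch of the Step 3 offsets, separate from the code in this
// commit: the combined L2 buffer appears to stack the two original L2 buffers
// along dimension 1 (sizes [1, 2, 32, 32] in the test below), so the i-th
// L2->L1 copy keeps all-zero source offsets while the i+1-th copy only needs
// newL2AsSourceOffsets[1] = 1. With the L2-side strides the test checks for,
// [2048, 1024, 4, 128, 32, 1], that single offset already addresses the second
// half of the 2048-element combined buffer. The second copy's remaining
// offsets are assumed to stay zero, like the first copy's.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Linear element offset of an access pattern: sum over dims of offset * stride.
int64_t linearOffset(const std::vector<int64_t> &offsets,
                     const std::vector<int64_t> &strides) {
  int64_t acc = 0;
  for (std::size_t d = 0; d < offsets.size(); ++d)
    acc += offsets[d] * strides[d];
  return acc;
}

int main() {
  std::vector<int64_t> strides{2048, 1024, 4, 128, 32, 1};
  std::vector<int64_t> firstOffsets{0, 0, 0, 0, 0, 0};   // i-th L2->L1 copy
  std::vector<int64_t> secondOffsets{0, 1, 0, 0, 0, 0};  // i+1-th L2->L1 copy
  std::cout << linearOffset(firstOffsets, strides) << "\n";   // prints 0
  std::cout << linearOffset(secondOffsets, strides) << "\n";  // prints 1024
  return 0;
}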
@@ -872,31 +914,24 @@ LogicalResult combineLogicalObjectFifos(
// the first L2->L1 Dma.
newL2AsSourceOffsets = l2ToL1DmaOps[i + 1].getSourceMixedOffsets();
newL2AsSourceOffsets[1] = rewriter.getIndexAttr(1);
DmaCpyNdOp newSecondL2ToL1DmaOp = createL2ToL1ForReuse(
rewriter, l2ToL1DmaOps[i + 1], reuseL1LogicalObjectFifoOp,
newL2ObjectFifo, newL2AsSourceOffsets);
createL2ToL1ForReuse(rewriter, l2ToL1DmaOps[i + 1],
reuseL1LogicalObjectFifoOp, newL2ObjectFifo,
newL2AsSourceOffsets);

// Step 3. PICK the CoreOps associated with the 1:1 L2->L1.
// Step 4. Pick the CoreOps associated with the 1:1 L2->L1.
// For the first Core op we'll insert Read at the end. It doesn't matter
// for now so we're gonna insert it right before amdaie.end op.
std::optional<CoreOp> maybeFirstCoreOp =
fetchUniqueCoreOp(newFirstL2ToL1DmaOp);
if (!maybeFirstCoreOp) return failure();
CoreOp firstCoreOp = maybeFirstCoreOp.value();
firstCoreOp.walk([&](AMDAIE::EndOp endOp) {
OpBuilder::InsertionGuard guard(rewriter);
// Hardcoding to `AMDAIE::MemoryAccess::Read`.
rewriter.setInsertionPoint(endOp);
rewriter.create<AMDAIE::LogicalObjectFifoAccessOp>(
rewriter.getUnknownLoc(), reuseL1LogicalObjectFifoOp.getOutput(),
AMDAIE::MemoryAccess::Read);
firstCoreOp.walk([&](AMDAIE::LogicalObjectFifoAccessOp accessOp) {
if (accessOp.getInput() == newFirstL2ToL1DmaOp.getTargetObjectFifo()) {
OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPointAfter(accessOp);
rewriter.create<AMDAIE::LogicalObjectFifoAccessOp>(
rewriter.getUnknownLoc(), reuseL1LogicalObjectFifoOp.getOutput(),
accessOp.getAccessType());
}
});
// For the second Core op we'll insert `Read` right before the first read
// from the corresponding L1 logicalobjectFifo.
std::optional<CoreOp> maybeSecondCoreOp =
fetchUniqueCoreOp(newSecondL2ToL1DmaOp);
if (!maybeSecondCoreOp) return failure();
CoreOp secondCoreOp = maybeSecondCoreOp.value();
secondCoreOp.walk([&](AMDAIE::LogicalObjectFifoAccessOp accessOp) {
if (accessOp.getInput() == l2ToL1DmaOps[i + 1].getTargetObjectFifo()) {
OpBuilder::InsertionGuard guard(rewriter);
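// Standalone sketch of step d / Step 4 above, separate from the code in this
// commit: per the comments, the i-th core op gets a Read access from the
// reused L1 objectfifo inserted right before its terminator, while the i+1-th
// core op gets one right before its first existing read of that objectfifo.
// Strings stand in for ops here; the pass itself uses rewriter insertion
// points around amdaie.end and amdaie.logicalobjectfifo.access ops.
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

static void dump(const std::vector<std::string> &ops) {
  for (const std::string &op : ops) std::cout << op << " ";
  std::cout << "\n";
}

int main() {
  std::vector<std::string> firstCore{"generic", "read", "generic", "end"};
  std::vector<std::string> secondCore{"generic", "read", "generic", "end"};

  // d.1: towards the end, i.e. right before the amdaie.end terminator.
  firstCore.insert(
      std::find(firstCore.begin(), firstCore.end(), std::string("end")),
      "reuse_read");
  // d.2: right before the corresponding existing access in the same core op.
  secondCore.insert(
      std::find(secondCore.begin(), secondCore.end(), std::string("read")),
      "reuse_read");

  dump(firstCore);   // generic read generic reuse_read end
  dump(secondCore);  // generic reuse_read read generic end
  return 0;
}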
@@ -32,16 +32,16 @@
// CHECK: %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd(
// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1]
// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, %[[IV0_32]], %[[IV1_0]]] [1, 2, 32, 32] [4096, 32, 128, 1]
// CHECK: %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]]}
// CHECK: %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]], %[[TILE_0]]}
// CHECK: %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd(
// CHECK-SAME: %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
// CHECK: amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out :
// CHECK: linalg.generic
// CHECK: %[[FIRST_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read)
// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read)
// CHECK: linalg.generic
// CHECK-SAME: %[[FIRST_READ]]
// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read)
// CHECK: amdaie.end
// CHECK: }
// CHECK: %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd(
@@ -55,16 +55,16 @@
// CHECK-SAME: %[[SECOND_READ]]
// CHECK: amdaie.end
// CHECK: }
// CHECK: %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_2]]}
// CHECK: %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_3]], %[[TILE_2]]}
// CHECK: %[[DMA_CPY_ND_L2_TO_L1_2:.*]] = amdaie.dma_cpy_nd(
// CHECK-SAME: %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
// CHECK: amdaie.core(%[[TILE_2]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_2]]], out :
// CHECK: linalg.generic
// CHECK: %[[FIRST_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read)
// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read)
// CHECK: linalg.generic
// CHECK-SAME: %[[FIRST_READ]]
// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read)
// CHECK: amdaie.end
// CHECK: }
// CHECK: %[[DMA_CPY_ND_L2_TO_L1_3:.*]] = amdaie.dma_cpy_nd(
