Skip to content

Commit

Permalink
[CanonicalizeDoublyStridedOp] Make single dims explicit by default an…
Browse files Browse the repository at this point in the history
…d require this in LowerToAIE (#576)

NPU DMA operations with implicit addressing on L3 side behave
incorrectly on some cases and while the correct strides and sizes can be
derived from other places, this is quite indirect and easily leads to
bugs. For example, in the following case it can be derived from the
input `%circ_dma` that the explicit source access pattern is `[0] [1024]
[1]` or `[0, 0] [32, 32] [32, 1]` by inspecting at the **target** side
of that operation:

```
%circ_dma = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo<memref<32x32xi32, 1>>, !amdaie.logicalobjectfifo<memref<1024x1024xi32>>)
...
%npu_dma = amdaie.npu.dma_cpy_nd %circ_dma ([] [] [], [] [] [] bd_id = %bd_id_0)
```

However, it would be more clear and simple to just keep the access
patterns explicit (when there is one) like this:

```
%circ_dma = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo<memref<32x32xi32, 1>>, !amdaie.logicalobjectfifo<memref<1024x1024xi32>>)
...
%npu_dma = amdaie.npu.dma_cpy_nd %circ_dma ([] [] [], [0] [1024] [1] bd_id = %bd_id_0)
```
  • Loading branch information
jtuyls authored Jul 19, 2024
1 parent a77cb05 commit 408ed22
Show file tree
Hide file tree
Showing 7 changed files with 328 additions and 221 deletions.
18 changes: 18 additions & 0 deletions build_tools/ci/run_matmul_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -765,6 +765,24 @@ run_matmul_test \
--m "128" --k "32" --n "64" \
--num_repeat_runs "10"

run_matmul_test \
--name_prefix "small" \
--lower_to_aie_pipeline "objectFifo" \
--tile_pipeline "pack-peel" \
--lhs_rhs_type "i32" \
--acc_type "i32" \
--m "128" --k "32" --n "128" \
--num_repeat_runs "10"

run_matmul_test \
--name_prefix "small" \
--lower_to_aie_pipeline "objectFifo" \
--tile_pipeline "pack-peel" \
--lhs_rhs_type "i32" \
--acc_type "i32" \
--m "256" --k "32" --n "256" \
--num_repeat_runs "10"

run_matmul_test \
--name_prefix "small" \
--lower_to_aie_pipeline "objectFifo" \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,9 @@ class AMDAIECanonicalizeDoublyStridedOpPass
AMDAIECanonicalizeDoublyStridedOpPass() = default;
AMDAIECanonicalizeDoublyStridedOpPass(
const AMDAIECanonicalizeDoublyStridedOpPass &pass){};
AMDAIECanonicalizeDoublyStridedOpPass(
const AMDAIECanonicalizeDoublyStridedOpOptions &options)
: AMDAIECanonicalizeDoublyStridedOpBase(options) {}
void runOnOperation() override;
};

Expand All @@ -134,15 +137,17 @@ void AMDAIECanonicalizeDoublyStridedOpPass::runOnOperation() {
});

// Make DMA accesses with single dimension implicit.
parentOp->walk([&](AMDAIE::DoublyStridedOpInterface dmaOp) {
(void)foldDmaOpSingleDims(rewriter, dmaOp);
});
if (foldSingleDims) {
parentOp->walk([&](AMDAIE::DoublyStridedOpInterface dmaOp) {
(void)foldDmaOpSingleDims(rewriter, dmaOp);
});
}
}

} // namespace

std::unique_ptr<Pass> createAMDAIECanonicalizeDoublyStridedOpPass() {
return std::make_unique<AMDAIECanonicalizeDoublyStridedOpPass>();
std::unique_ptr<Pass> createAMDAIECanonicalizeDoublyStridedOpPass(AMDAIECanonicalizeDoublyStridedOpOptions options) {
return std::make_unique<AMDAIECanonicalizeDoublyStridedOpPass>(options);
}

} // namespace mlir::iree_compiler::AMDAIE
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,19 @@ namespace {
AIE::BDDimLayoutArrayAttr convertSizeStrideToBDDimLayoutArrayAttr(
IRRewriter &rewriter, const SmallVector<OpFoldResult> &sizes,
const SmallVector<OpFoldResult> &strides) {
assert(sizes.size() == strides.size() &&
"expected stride and size vectors of same size");
SmallVector<AIE::BDDimLayoutAttr, 4> bdDimLayoutAttr;
// If the access pattern (strides/sizes) have a single dimension, make it
// implicit with an empty `BDDimLayoutAttr` as this is what the AIE dialect
// expects.
if (strides.size() == 1) {
std::optional<int64_t> stride = getConstantIntValue(strides[0]);
if (stride && stride.value() == 1) {
return AIE::BDDimLayoutArrayAttr::get(rewriter.getContext(),
ArrayRef(bdDimLayoutAttr));
}
}
bdDimLayoutAttr.reserve(sizes.size());
for (auto [size, stride] : llvm::zip(sizes, strides)) {
bdDimLayoutAttr.push_back(AIE::BDDimLayoutAttr::get(
Expand Down Expand Up @@ -477,34 +489,6 @@ LogicalResult getStaticDimsForExplicitAddressing(
return success();
}

/// Utility to get the static sizes and strides for `AIEX::NpuDmaMemcpyNdOp`
/// with implicit addressing.
LogicalResult getStaticDimsForImplicitAddressing(
Operation *op, MemRefType memrefType, SmallVectorImpl<int64_t> &staticSizes,
SmallVectorImpl<int64_t> &staticStrides) {
assert((staticSizes.size() == staticStrides.size()) &&
"static size and strides should be of same size");
// 1. Static sizes.
SmallVector<int64_t> shapeArr = llvm::to_vector(memrefType.getShape());
// staticOffsets/staticSizes/staticStrides all have same size.
if (shapeArr.size() > staticSizes.size()) {
op->emitError() << "implicit source/target L3 memref has rank greater than "
"the expected static offsets/sizes/strides rank (4)";
return failure();
}
std::copy(shapeArr.begin(), shapeArr.end(),
staticSizes.end() - shapeArr.size());
// 2. Static strides.
int64_t strideIndex = staticStrides.size() - 1;
// For contiguous access the strides for the last dimension is always 1.
int64_t prevStride = 1;
for (int64_t i = shapeArr.size() - 1; i >= 0; i--) {
staticStrides[strideIndex] = prevStride;
prevStride = staticStrides[strideIndex--] * shapeArr[i];
}
return success();
}

/// Utility to move 'repeat dimension' with stride 0 and size > 1 to outermost
/// dimension as only that one can support a stride with value 0 in AIE2(+)
/// hardware. But first check that such a dimension is actually the first 'real
Expand Down Expand Up @@ -544,7 +528,11 @@ LogicalResult npuDmaCpyNdOpToAIE(IRRewriter &rewriter,
IRMapping &mapper, IRMapping &bindingsMapper) {
rewriter.setInsertionPoint(dmaOp);
// Convert bidirectional `amdaie.npu.dma_cpy_nd` op into two halves.
if (dmaOp.hasSourceAddressing() || dmaOp.getSourceMemorySpaceAsUInt() == 0) {
if (dmaOp.getSourceMemorySpaceAsUInt() == 0) {
if (!dmaOp.hasSourceAddressing()) {
return dmaOp.emitOpError()
<< "expected source addressing for DMA with source on L3";
}
AMDAIE::BdIdOp bdIdOp = dmaOp.getSourceBdIdOp();
if (!bdIdOp)
return dmaOp.emitOpError() << "expected to have a source BD ID op";
Expand All @@ -555,23 +543,12 @@ LogicalResult npuDmaCpyNdOpToAIE(IRRewriter &rewriter,
SmallVector<int64_t, 4> staticOffsets(4, 0);
SmallVector<int64_t, 4> staticSizes(4, 1);
SmallVector<int64_t, 4> staticStrides(4, 0);
if (dmaOp.hasSourceAddressing()) {
if (failed(getStaticDimsForExplicitAddressing(
dmaOp, dmaOp.getSourceMixedOffsets(), dmaOp.getSourceMixedSizes(),
dmaOp.getSourceMixedStrides(), staticOffsets, staticSizes,
staticStrides))) {
return failure();
}
} else {
MemRefType sourceMemrefType =
cast<LogicalObjectFifoType>(dmaOp.getDmaCpyNdOp().getSourceType())
.getElementType();
if (failed(getStaticDimsForImplicitAddressing(
dmaOp, sourceMemrefType, staticSizes, staticStrides))) {
return failure();
}
if (failed(getStaticDimsForExplicitAddressing(
dmaOp, dmaOp.getSourceMixedOffsets(), dmaOp.getSourceMixedSizes(),
dmaOp.getSourceMixedStrides(), staticOffsets, staticSizes,
staticStrides))) {
return failure();
}

if (failed(canonicalizeNpuStridedPatternForAIE(staticOffsets, staticSizes,
staticStrides))) {
return dmaOp.emitError() << "could not canonicalize for AIE";
Expand All @@ -592,7 +569,11 @@ LogicalResult npuDmaCpyNdOpToAIE(IRRewriter &rewriter,
empty, empty, staticOffsets, staticSizes, staticStrides,
objFifo.getName(), bdIdOp.getValue(), issueToken);
}
if (dmaOp.hasTargetAddressing() || dmaOp.getTargetMemorySpaceAsUInt() == 0) {
if (dmaOp.getTargetMemorySpaceAsUInt() == 0) {
if (!dmaOp.hasTargetAddressing()) {
return dmaOp.emitOpError()
<< "expected target addressing for DMA with target on L3";
}
AMDAIE::BdIdOp bdIdOp = dmaOp.getTargetBdIdOp();
if (!bdIdOp)
return dmaOp.emitOpError() << "expected to have a target BD ID op";
Expand All @@ -603,23 +584,12 @@ LogicalResult npuDmaCpyNdOpToAIE(IRRewriter &rewriter,
SmallVector<int64_t, 4> staticOffsets(4, 0);
SmallVector<int64_t, 4> staticSizes(4, 1);
SmallVector<int64_t, 4> staticStrides(4, 0);
if (dmaOp.hasTargetAddressing()) {
if (failed(getStaticDimsForExplicitAddressing(
dmaOp, dmaOp.getTargetMixedOffsets(), dmaOp.getTargetMixedSizes(),
dmaOp.getTargetMixedStrides(), staticOffsets, staticSizes,
staticStrides))) {
return failure();
}
} else {
MemRefType targetMemrefType =
cast<LogicalObjectFifoType>(dmaOp.getDmaCpyNdOp().getTargetType())
.getElementType();
if (failed(getStaticDimsForImplicitAddressing(
dmaOp, targetMemrefType, staticSizes, staticStrides))) {
return failure();
}
if (failed(getStaticDimsForExplicitAddressing(
dmaOp, dmaOp.getTargetMixedOffsets(), dmaOp.getTargetMixedSizes(),
dmaOp.getTargetMixedStrides(), staticOffsets, staticSizes,
staticStrides))) {
return failure();
}

if (failed(canonicalizeNpuStridedPatternForAIE(staticOffsets, staticSizes,
staticStrides))) {
return dmaOp.emitError() << "could not canonicalize for AIE";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,8 @@ std::unique_ptr<Pass> createAMDAIEBufferizeToAllocationPass(
std::unique_ptr<Pass> createAMDAIECanonicalizeDmaPass();

/// Create pass to canonicalize doubly strided operations.
std::unique_ptr<Pass> createAMDAIECanonicalizeDoublyStridedOpPass();
std::unique_ptr<Pass> createAMDAIECanonicalizeDoublyStridedOpPass(
AMDAIECanonicalizeDoublyStridedOpOptions options = {});

/// Pass to unroll the loops within the control code regions.
std::unique_ptr<Pass> createAMDAIEControlCodeLoopUnrollPass();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ def AMDAIECanonicalizeDoublyStridedOp :
Pass<"iree-amdaie-canonicalize-doubly-strided-op", ""> {
let summary = "Canonicalize doubly strided DMA operations.";
let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIECanonicalizeDoublyStridedOpPass()";
let options = [
Option<"foldSingleDims", "fold-single-dims", "bool", /*default=*/"false",
"Whether to fold single strided dimensions and make then implicit.">
];
}

def AMDAIECleanup :
Expand Down
Loading

0 comments on commit 408ed22

Please sign in to comment.