Skip to content

Commit

Permalink
[DmaLoopSubsumption] Fix for strided op in loop without induction var…
Browse files Browse the repository at this point in the history
… dependency (#570)

Fixes an issue exposed by a 128x32x64 matmul:
#556.

This issue would come up in partial loop dependencies in case of
`scf.forall`:

```
scf.forall (%arg2, %arg3) in (2, 6) {
  %1 = amdaie.npu.dma_cpy_nd %0([0, %arg3] [8, 16] [16, 1], [] [] [])
  amdaie.npu.dma_wait(%2, S2MM)
}
```

In this case, the outer iteration upon which the NPU DMA operation
didn't depend ( `%arg2`) would not be considered, resulting in incorrect
output IR:

```
%1 = amdaie.npu.dma_cpy_nd %0([0, 0, 0] [6, 8, 16] [1, 16, 1], [] [] [])
amdaie.npu.dma_wait(%2, S2MM)
```

However, the outer iteration should be considered as well, making sure
the above DMA operation is executed twice, resulting in the below output
IR:

```
%1 = amdaie.npu.dma_cpy_nd %0([0, 0, 0, 0] [2, 6, 8, 16] [0, 1, 16, 1], [] [] [])
amdaie.npu.dma_wait(%2, S2MM)
```

This PR fixes the issue by adding general support for subsuming loop
iterations into strided operations without any loop induction variable
dependency.
  • Loading branch information
jtuyls authored Jul 18, 2024
1 parent 05368d0 commit a990f77
Show file tree
Hide file tree
Showing 4 changed files with 268 additions and 135 deletions.
14 changes: 12 additions & 2 deletions build_tools/ci/run_matmul_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -735,6 +735,7 @@ run_matmul_test \
--acc_type "f32" \
--m "8192" --k "2432" --n "7296"

###################################################################
# ObjectFifo Matmul tests
###################################################################

Expand All @@ -744,15 +745,24 @@ run_matmul_test \
--tile_pipeline "pack-peel" \
--lhs_rhs_type "i32" \
--acc_type "i32" \
--m "128" --k "256" --n "128"
--m "32" --k "32" --n "32"

run_matmul_test \
--name_prefix "small" \
--lower_to_aie_pipeline "objectFifo" \
--tile_pipeline "pack-peel" \
--lhs_rhs_type "i32" \
--acc_type "i32" \
--m "32" --k "32" --n "32"
--m "128" --k "32" --n "64" \
--num_repeat_runs "10"

run_matmul_test \
--name_prefix "small" \
--lower_to_aie_pipeline "objectFifo" \
--tile_pipeline "pack-peel" \
--lhs_rhs_type "i32" \
--acc_type "i32" \
--m "128" --k "256" --n "128"

run_matmul_test \
--name_prefix "medium" \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,18 @@

namespace mlir::iree_compiler::AMDAIE {

/// Utility to calculate the number of iterations of a loop with provided bounds
/// and step: `ceilDiv(upperBound - lowerBound, step)`.
int64_t calculateNbIterations(int64_t lowerBound, int64_t upperBound,
int64_t step) {
int64_t diff = upperBound - lowerBound;
assert(diff > 0 &&
"expected positive difference between upper bound and lower "
"bound");
assert(step > 0 && "expected positive step");
return 1 + ((diff - 1) / step);
}

// Constant specifying the number of inter-iteration dimension for DMA
// operations.
//
Expand Down Expand Up @@ -117,13 +129,23 @@ class SubsumeLoopIntoDMA
using OpInterfaceRewritePattern::OpInterfaceRewritePattern;

/// Utility to add a loop iteration to an offsets/sizes/strides access
/// pattern.
/// pattern. This function handles following cases:
/// 1. If an offset which has an loop induction variable dependency can be
/// found, calculate the stride and size based on the dependency, potentially
/// taking into account an affine expression multiplier.
/// 2. If there is no loop induction variable dependency, the iteration means
/// that this strided operation is repeated `ceilDiv(upperBound - lowerBound,
/// step)` number of times, so a new dimension is added to the access pattern
/// with `stride == 0` and `size == ceilDiv(upperBound - lowerBound, step)`.
LogicalResult addIterationToAccessPattern(
RewriterBase &rewriter, int64_t lowerBound, int64_t upperBound,
int64_t step, const DenseSet<Value> &inductionValues,
SmallVector<OpFoldResult> &newOffsets,
SmallVector<OpFoldResult> &newSizes,
SmallVector<OpFoldResult> &newStrides) const {
const int64_t nbIterations =
calculateNbIterations(lowerBound, upperBound, step);

SmallVector<OpFoldResult> insertOffsets;
SmallVector<OpFoldResult> insertSizes;
SmallVector<OpFoldResult> insertStrides;
Expand Down Expand Up @@ -159,24 +181,31 @@ class SubsumeLoopIntoDMA

newOffsets[i] = getAsIndexOpFoldResult(rewriter.getContext(),
lowerBound * offsetStride);

// Don't add a unit iteration to better use available dimensions.
// However, the current offset should be updated, therefore this check
// is placed after `newOffsets[i]` has been updated.
if (nbIterations == 1) continue;

insertOffsets.push_back(
getAsIndexOpFoldResult(rewriter.getContext(), 0));

// The step size is equal to the the number of iterations
// (ceilDiv(upperBound - lowerBound, step))
int64_t diff = upperBound - lowerBound;
assert(diff > 0 &&
"expected positive difference between upper bound and lower "
"bound");
assert(step > 0 && "expected positive step");
int64_t newSize = 1 + ((diff - 1) / step);
insertSizes.push_back(
getAsIndexOpFoldResult(rewriter.getContext(), newSize));

getAsIndexOpFoldResult(rewriter.getContext(), nbIterations));
insertStrides.push_back(
getAsIndexOpFoldResult(rewriter.getContext(), stride));
}
}
assert(insertOffsets.size() == insertSizes.size() &&
"expected same number of offsets and sizes to be inserted");
assert(insertOffsets.size() == insertStrides.size() &&
"expected same number of offsets and strides to be inserted");
// Handle the 'no loop dependency' case.
if (insertOffsets.empty() && nbIterations != 1) {
insertOffsets.push_back(getAsIndexOpFoldResult(rewriter.getContext(), 0));
insertSizes.push_back(
getAsIndexOpFoldResult(rewriter.getContext(), nbIterations));
insertStrides.push_back(getAsIndexOpFoldResult(rewriter.getContext(), 0));
}
newOffsets.insert(newOffsets.begin(), insertOffsets.begin(),
insertOffsets.end());
newSizes.insert(newSizes.begin(), insertSizes.begin(), insertSizes.end());
Expand Down Expand Up @@ -206,40 +235,38 @@ class SubsumeLoopIntoDMA
SmallVector<OpFoldResult> newTargetSizes = op.getTargetMixedSizes();
SmallVector<OpFoldResult> newTargetStrides = op.getTargetMixedStrides();

// Use source/target maxNbDims to check whether there are sufficient source
// and target dimensions. Otherwise, abort.
auto verifyNbDimsNeeded = [&](const SmallVector<Value> &dynamicOffsets,
size_t nbOffsets,
size_t maxNbDims) -> LogicalResult {
size_t counter = 0;
for (Value offset : dynamicOffsets)
if (allInductionValues.contains(offset)) counter++;
if (nbOffsets + counter > maxNbDims) return failure();
return success();
};
SmallVector<Value> dynamicSourceOffsets = op.getSourceOffsets();
SmallVector<Value> dynamicTargetOffsets = op.getTargetOffsets();
if (failed(verifyNbDimsNeeded(dynamicSourceOffsets, newSourceOffsets.size(),
sourceMaxNbDims)))
// Verify number of dimensions needed to subsume this loop into the strided
// access pattern and fail early if there aren't enough dimensions.
size_t nbNonUnitIterations{0};
for (auto &&[lb, ub, step] : llvm::zip(lowerBounds, upperBounds, steps)) {
const int64_t nbIterations = calculateNbIterations(lb, ub, step);
// We should not do any rewrite if we encounter a loop with no iterations.
if (nbIterations == 0) return failure();
if (nbIterations > 1) nbNonUnitIterations++;
}
if (newSourceOffsets.size() + nbNonUnitIterations > sourceMaxNbDims)
return failure();
if (failed(verifyNbDimsNeeded(dynamicTargetOffsets, newTargetOffsets.size(),
targetMaxNbDims)))
if (newTargetOffsets.size() + nbNonUnitIterations > targetMaxNbDims)
return failure();

// Add the loop iterations to the DMA access patterns.
for (auto &&[lb, ub, step, iterationIvValues] : llvm::reverse(
llvm::zip(lowerBounds, upperBounds, steps, inductionValues))) {
// Add loop iteration to the access pattern on the source side.
if (failed(addIterationToAccessPattern(
rewriter, lb, ub, step, iterationIvValues, newSourceOffsets,
newSourceSizes, newSourceStrides))) {
return failure();
if (!newSourceOffsets.empty()) {
if (failed(addIterationToAccessPattern(
rewriter, lb, ub, step, iterationIvValues, newSourceOffsets,
newSourceSizes, newSourceStrides))) {
return failure();
}
}
// Add loop iteration to the access pattern on the target side.
if (failed(addIterationToAccessPattern(
rewriter, lb, ub, step, iterationIvValues, newTargetOffsets,
newTargetSizes, newTargetStrides))) {
return failure();
if (!newTargetOffsets.empty()) {
if (failed(addIterationToAccessPattern(
rewriter, lb, ub, step, iterationIvValues, newTargetOffsets,
newTargetSizes, newTargetStrides))) {
return failure();
}
}
}

Expand Down Expand Up @@ -290,11 +317,6 @@ class SubsumeLoopIntoDMA
curIvValues.insert(userApplyOp.getResult());
}
}
if (!llvm::any_of(op->getOperands(), [&](Value operand) {
return curIvValues.contains(operand);
})) {
return failure();
}

SmallVector<int64_t> lowerBounds = {lowerBound.value()};
SmallVector<int64_t> upperBounds = {upperBound.value()};
Expand Down Expand Up @@ -342,13 +364,6 @@ class SubsumeLoopIntoDMA
}
inductionValues.push_back(curIvValues);
}
// Return early if the strided operation doesn't use any of the
// induction variable dependent values.
if (!llvm::any_of(op->getOperands(), [&](Value operand) {
return allInductionValues.contains(operand);
})) {
return failure();
}
return rewriteWithLoopLikeOpParent(op, rewriter, sourceMaxNbDims,
targetMaxNbDims, lowerBounds.value(),
upperBounds.value(), steps.value(),
Expand Down
Loading

0 comments on commit a990f77

Please sign in to comment.