Skip to content

Commit c607072

Browse files
authored
[CombineStridedOps] Add a combinable case (#839)
This PR adds support for a corner case in strided op combination. With this PR, the following strided ops:
```
%48 = amdaie.npu.dma_cpy_nd %8([] [] [], %31[0, %45] [32, 64] [128, 1]) : source_type = !amdaie.logicalobjectfifo<memref<16384xi32>>
%49 = amdaie.npu.dma_cpy_nd %8([] [] [], %31[32, %45] [96, 64] [128, 1]) : source_type = !amdaie.logicalobjectfifo<memref<16384xi32>>
```
can be combined into `%48 = amdaie.npu.dma_cpy_nd %8([] [] [], %31[0, %45] [128, 64] [128, 1]) : source_type = !amdaie.logicalobjectfifo<memref<16384xi32>>`. Addresses review comments #826 (comment).
1 parent c84cca0 commit c607072

File tree

4 files changed

+120
-33
lines changed

4 files changed

+120
-33
lines changed

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.cpp

+62-32
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,14 @@ bool areAccessPatternsCombinable(const SmallVector<OpFoldResult> &offsetsA,
6060
}
6161
if (strideA != strideB) return false;
6262
}
63+
64+
// Don't check the outermost dimension of size at this point.
65+
SmallVector<OpFoldResult> innerSizesA;
66+
SmallVector<OpFoldResult> innerSizesB;
67+
std::copy(sizesA.begin() + 1, sizesA.end(), std::back_inserter(innerSizesA));
68+
std::copy(sizesB.begin() + 1, sizesB.end(), std::back_inserter(innerSizesB));
6369
for (auto &&[sizeA, sizeB] :
64-
llvm::zip(llvm::reverse(sizesA), llvm::reverse(sizesB))) {
70+
llvm::zip(llvm::reverse(innerSizesA), llvm::reverse(innerSizesB))) {
6571
std::optional<int64_t> maybeSizeA = getConstantIntValue(sizeA);
6672
std::optional<int64_t> maybeSizeB = getConstantIntValue(sizeB);
6773
// Handle static and constant value with same int value.
@@ -71,6 +77,20 @@ bool areAccessPatternsCombinable(const SmallVector<OpFoldResult> &offsetsA,
7177
if (sizeA != sizeB) return false;
7278
}
7379

80+
// Edge case for sizesA[0] != sizesB[0].
81+
if (offsetsB.size() == offsetsA.size() && sizesA[0] != sizesB[0]) {
82+
std::optional<int64_t> constOffsetA = getConstantIntValue(offsetsA[0]);
83+
std::optional<int64_t> constSizeA = getConstantIntValue(sizesA[0]);
84+
std::optional<int64_t> constOffsetB = getConstantIntValue(offsetsB[0]);
85+
std::optional<int64_t> constSizeB = getConstantIntValue(sizesB[0]);
86+
if (constOffsetA && constOffsetB && constSizeA && constSizeB) {
87+
int64_t offsetDiff = constOffsetB.value() - constOffsetA.value();
88+
if (constSizeA.value() != offsetDiff) return false;
89+
} else {
90+
return false;
91+
}
92+
}
93+
7494
bool foundDiff{false};
7595
for (auto iter : llvm::enumerate(
7696
llvm::zip(llvm::reverse(offsetsA), llvm::reverse(offsetsB)))) {
@@ -169,40 +189,50 @@ LogicalResult combineAccessPatterns(RewriterBase &rewriter,
169189
if (!size) return failure();
170190
newSizes[0] = rewriter.getI64IntegerAttr(size.value() + 1);
171191
} else {
172-
// Sizes are the same, so add a new dimension with 'offset == 0', 'size ==
173-
// 2' and 'stride == offsetDiff'.
174-
newOffsets.push_back(rewriter.getI64IntegerAttr(0));
175-
int64_t offsetDiff;
176-
int64_t strideMultiplier;
177-
for (auto iter : llvm::enumerate(llvm::zip(offsetsA, offsetsB))) {
178-
const OpFoldResult &offsetA = std::get<0>(iter.value());
179-
const OpFoldResult &offsetB = std::get<1>(iter.value());
180-
newOffsets.push_back(offsetA);
181-
if (offsetA != offsetB) {
182-
std::optional<int64_t> constOffsetA = getConstantIntValue(offsetA);
183-
std::optional<int64_t> constOffsetB = getConstantIntValue(offsetB);
184-
if (!constOffsetA || !constOffsetB) {
185-
return emitError(rewriter.getUnknownLoc())
186-
<< "differing offsets should be constants";
187-
}
188-
offsetDiff = constOffsetB.value() - constOffsetA.value();
189-
std::optional<int64_t> maybeStride =
190-
getConstantIntValue(stridesA[iter.index()]);
191-
if (!maybeStride) {
192-
return emitError(rewriter.getUnknownLoc())
193-
<< "no constant stride found at the same index where the "
194-
"offset "
195-
"difference occurs";
192+
// Edge case for sizesA[0] != sizesB[0].
193+
if (sizesA[0] != sizesB[0]) {
194+
newOffsets = offsetsA;
195+
newSizes = sizesA;
196+
newStrides = stridesA;
197+
std::optional<int64_t> sizeA = getConstantIntValue(sizesA[0]);
198+
std::optional<int64_t> sizeB = getConstantIntValue(sizesB[0]);
199+
if (!sizeA || !sizeB) return failure();
200+
newSizes[0] = rewriter.getI64IntegerAttr(sizeA.value() + sizeB.value());
201+
} else {
202+
// All dims of sizes are the same, so add a new dimension with
203+
// 'offset == 0', 'size == 2' and 'stride == offsetDiff'.
204+
newOffsets.push_back(rewriter.getI64IntegerAttr(0));
205+
int64_t offsetDiff;
206+
int64_t strideMultiplier;
207+
for (auto iter : llvm::enumerate(llvm::zip(offsetsA, offsetsB))) {
208+
const OpFoldResult &offsetA = std::get<0>(iter.value());
209+
const OpFoldResult &offsetB = std::get<1>(iter.value());
210+
newOffsets.push_back(offsetA);
211+
if (offsetA != offsetB) {
212+
std::optional<int64_t> constOffsetA = getConstantIntValue(offsetA);
213+
std::optional<int64_t> constOffsetB = getConstantIntValue(offsetB);
214+
if (!constOffsetA || !constOffsetB) {
215+
return emitError(rewriter.getUnknownLoc())
216+
<< "differing offsets should be constants";
217+
}
218+
offsetDiff = constOffsetB.value() - constOffsetA.value();
219+
std::optional<int64_t> maybeStride =
220+
getConstantIntValue(stridesA[iter.index()]);
221+
if (!maybeStride) {
222+
return emitError(rewriter.getUnknownLoc())
223+
<< "no constant stride found at the same index where the "
224+
"offset "
225+
"difference occurs";
226+
}
227+
strideMultiplier = maybeStride.value();
196228
}
197-
strideMultiplier = maybeStride.value();
198229
}
230+
newSizes.push_back(rewriter.getI64IntegerAttr(2));
231+
newSizes.append(sizesA.begin(), sizesA.end());
232+
newStrides.push_back(
233+
rewriter.getI64IntegerAttr(offsetDiff * strideMultiplier));
234+
newStrides.append(stridesA.begin(), stridesA.end());
199235
}
200-
newSizes.push_back(rewriter.getI64IntegerAttr(2));
201-
newSizes.append(sizesA.begin(), sizesA.end());
202-
newStrides.push_back(
203-
rewriter.getI64IntegerAttr(offsetDiff * strideMultiplier));
204-
newStrides.append(stridesA.begin(), stridesA.end());
205-
;
206236
}
207237
assert(newOffsets.size() == newSizes.size() &&
208238
"expected same number of new offsets and sizes");

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETemporaryAllocBufferization.cpp

-1
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,6 @@ LogicalResult bufferizeTemporaryMemrefs(Operation *parentOp) {
4848
});
4949
}
5050

51-
5251
// Note: we don't erase allocs/deallocs, we leave this for canonicalization.
5352

5453
return success();

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/AMDAIEDmaUtilsTest.cpp

+14
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,8 @@ TEST_F(AccessPatternCombinationTest, CombinableAccessPatterns) {
111111
EXPECT_TRUE(checkAreAccessPatternsCombinable({0, 2, 0}, {16, 16, 32},
112112
{32, 64, 1}, {0, 2, 32},
113113
{16, 16, 32}, {32, 64, 1}, 4));
114+
EXPECT_TRUE(checkAreAccessPatternsCombinable({32, 0}, {64, 64}, {128, 1},
115+
{96, 0}, {32, 64}, {128, 1}, 4));
114116
// size(A) > size(B)
115117
EXPECT_TRUE(checkAreAccessPatternsCombinable(
116118
{0, 0, 0}, {2, 16, 32}, {32, 64, 1}, {0, 64}, {16, 32}, {64, 1}, 4));
@@ -168,6 +170,12 @@ TEST_F(AccessPatternCombinationTest, NonCombinableAccessPatterns) {
168170
{0, 0}, {16, 32}, {64, 1}, {0, 0, 96}, {2, 16, 32}, {32, 64, 1}, 4));
169171
EXPECT_FALSE(checkAreAccessPatternsCombinable(
170172
{0, 0}, {16, 32}, {64, 1}, {0, 1, 0}, {2, 16, 32}, {32, 64, 1}, 4));
173+
174+
// size(A) == size(B) Incompatible offset
175+
EXPECT_FALSE(checkAreAccessPatternsCombinable(
176+
{32, 0}, {64, 64}, {128, 1}, {32, 0}, {32, 64}, {128, 1}, 4));
177+
EXPECT_FALSE(checkAreAccessPatternsCombinable(
178+
{32, 0}, {32, 64}, {128, 1}, {96, 0}, {64, 64}, {128, 1}, 4));
171179
}
172180

173181
TEST_F(AccessPatternCombinationTest, CombineAccessPatterns) {
@@ -197,6 +205,8 @@ TEST_F(AccessPatternCombinationTest, CombineAccessPatterns) {
197205
checkCombineAccessPatterns({8, 0, 0}, {16, 8, 16}, {16, 8, 1}, {40, 0, 0},
198206
{16, 8, 16}, {16, 8, 1}, {0, 8, 0, 0},
199207
{2, 16, 8, 16}, {512, 16, 8, 1}, 4);
208+
checkCombineAccessPatterns({32, 0}, {64, 64}, {128, 1}, {96, 0}, {32, 64},
209+
{128, 1}, {32, 0}, {96, 64}, {128, 1}, 4);
200210
// size(A) > size(B)
201211
checkCombineAccessPatterns({0, 0}, {2, 32}, {64, 1}, {128}, {32}, {1}, {0, 0},
202212
{3, 32}, {64, 1}, 3);
@@ -255,6 +265,10 @@ TEST_F(AccessPatternCombinationTest, FailCombineAccessPatterns) {
255265
{3, 32}, {64, 1}, 3, false);
256266
checkCombineAccessPatterns({0}, {32}, {1}, {0, 96}, {2, 32}, {64, 1}, {0, 0},
257267
{3, 32}, {64, 1}, 3, false);
268+
269+
// size(A) == size(B) Incompatible offset
270+
checkCombineAccessPatterns({32, 0}, {32, 64}, {128, 1}, {96, 0}, {64, 64},
271+
{128, 1}, {32, 0}, {96, 64}, {128, 1}, 4, false);
258272
}
259273

260274
} // namespace

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_strided_ops.mlir

+44
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,28 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
230230

231231
// -----
232232

233+
// CHECK-LABEL: @combine_source_same_dims_diff_sizes
234+
// CHECK: %[[CONNECTION:.+]] = amdaie.connection
235+
// CHECK: amdaie.npu.dma_cpy_nd %[[CONNECTION]]([] [] [], [0, 0] [128, 64] [128, 1])
236+
// CHECK-NOT: amdaie.npu.dma_cpy_nd
237+
#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
238+
module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
239+
func.func @combine_source_same_dims_diff_sizes(%arg0: !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>>, %arg1: !amdaie.logicalobjectfifo<memref<128x128xi32>>) {
240+
amdaie.workgroup {
241+
%0 = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
242+
amdaie.controlcode {
243+
amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0] [32, 64] [128, 1])
244+
amdaie.npu.dma_cpy_nd %0([] [] [], [32, 0] [64, 64] [128, 1])
245+
amdaie.npu.dma_cpy_nd %0([] [] [], [96, 0] [32, 64] [128, 1])
246+
amdaie.end
247+
}
248+
}
249+
return
250+
}
251+
}
252+
253+
// -----
254+
233255
// CHECK-LABEL: @combine_source_values
234256
// CHECK: %[[CONNECTION:.+]] = amdaie.connection
235257
// CHECK: amdaie.npu.dma_cpy_nd %[[CONNECTION]]([] [] [], [0, 0, 0, 0] [2, 16, 8, 16] [32, 32, 8, 1])
@@ -332,6 +354,28 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
332354

333355
// -----
334356

357+
// CHECK-LABEL: @combine_target_same_dims_diff_sizes
358+
// CHECK: %[[CONNECTION:.+]] = amdaie.connection
359+
// CHECK: amdaie.npu.dma_cpy_nd %[[CONNECTION]]([0, 0] [128, 64] [128, 1], [] [] [])
360+
// CHECK-NOT: amdaie.npu.dma_cpy_nd
361+
#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
362+
module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
363+
func.func @combine_target_same_dims_diff_sizes(%arg0: !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>>, %arg1: !amdaie.logicalobjectfifo<memref<128x128xi32>>) {
364+
amdaie.workgroup {
365+
%0 = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
366+
amdaie.controlcode {
367+
amdaie.npu.dma_cpy_nd %0([0, 0] [32, 64] [128, 1], [] [] [])
368+
amdaie.npu.dma_cpy_nd %0([32, 0] [64, 64] [128, 1], [] [] [])
369+
amdaie.npu.dma_cpy_nd %0([96, 0] [32, 64] [128, 1], [] [] [])
370+
amdaie.end
371+
}
372+
}
373+
return
374+
}
375+
}
376+
377+
// -----
378+
335379
// CHECK-LABEL: @combine_target_diff_dims
336380
// CHECK: %[[CONNECTION:.+]] = amdaie.connection
337381
// CHECK: amdaie.npu.dma_cpy_nd %[[CONNECTION]]([0, 0, 0, 32] [3, 16, 8, 16] [64, 32, 8, 1], [] [] [])

0 commit comments

Comments
 (0)