Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Draft] Add NpuTctSync operation #990

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -1022,6 +1022,31 @@ def AMDAIE_NpuWriteBdOp: AMDAIE_Op<"npu.write_bd"> {
let assemblyFormat = [{ attr-dict }];
}

// NPU control-code op that waits for Task Completion Tokens (TCTs). All of its
// parameters are carried as attributes; it has no operands or results.
def AMDAIE_NpuTctSyncOp: AMDAIE_Op<"npu.tct_sync"> {
  let summary = "Wait for the TCTs to be emitted.";
  let description = [{
    NPU controller operation that synchronizes on the Task Completion Tokens
    (TCTs) emitted on the specified `channel` and `direction`. The tiles to
    synchronize on are given by the half-open ranges `[col, col + col_num)`
    and `[row, row + row_num)`.

    Example:

    ```mlir
    amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 1 : ui32,
                         direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32}
    ```
  }];
  // NOTE: the argument order below determines the order of the generated
  // builder parameters; keep it in sync with all `create<NpuTctSyncOp>` calls.
  let arguments = (
    ins UI32Attr:$col,
        UI32Attr:$row,
        DMAChannelDir:$direction,
        UI32Attr:$channel,
        UI32Attr:$col_num,
        UI32Attr:$row_num
  );
  let assemblyFormat = [{ attr-dict }];
}

//===----------------------------------------------------------------------===//
// IREE AMDAIE LogicalObjectFifo Ops
//===----------------------------------------------------------------------===//
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,31 @@ struct HalfDmaCpyNdToNpuConverter final
uint8_t minStrideBitWidth;
};

/// Converts an `AMDAIE::NpuDmaWaitOp` into one `AMDAIE::NpuTctSyncOp` per
/// awaited async token.
///
/// Every token must be defined by an `AMDAIE::NpuPushToQueueOp`; otherwise an
/// op error is emitted and the pattern fails. Each created sync op copies the
/// col, row, direction and channel of the producing push-to-queue op and
/// covers a single tile (`col_num = 1`, `row_num = 1`). The wait op is erased
/// once all sync ops have been created.
struct DmaWaitToTctSyncConverter final
    : OpConversionPattern<AMDAIE::NpuDmaWaitOp> {
  using OpConversionPattern::OpConversionPattern;

  LogicalResult matchAndRewrite(
      AMDAIE::NpuDmaWaitOp op, OpAdaptor adaptor,
      ConversionPatternRewriter &rewriter) const override {
    LLVM_DEBUG(llvm::dbgs() << "matchAndRewrite[AMDAIE::NpuDmaWaitOp]\n");

    // Returns the push-to-queue op defining `token`, or a null op if the token
    // is a block argument or is produced by any other operation.
    auto getProducer = [](Value token) {
      return dyn_cast_if_present<AMDAIE::NpuPushToQueueOp>(
          token.getDefiningOp());
    };

    // Match phase: validate every token before touching the IR, so that a
    // failure on a later token never leaves already-created sync ops behind.
    for (Value token : op.getAsyncTokens()) {
      if (!getProducer(token)) {
        return op.emitOpError()
               << "should operate on an `amdaie.push_to_queue` op";
      }
    }

    // Rewrite phase: all tokens are known to be valid at this point.
    for (Value token : op.getAsyncTokens()) {
      AMDAIE::NpuPushToQueueOp pushToQueueOp = getProducer(token);
      rewriter.create<AMDAIE::NpuTctSyncOp>(
          op.getLoc(), pushToQueueOp.getCol(), pushToQueueOp.getRow(),
          pushToQueueOp.getDirection(), pushToQueueOp.getChannel(),
          /*col_num=*/1, /*row_num=*/1);
    }
    rewriter.eraseOp(op);
    return success();
  }
};

namespace {
class AMDAIEControlCodeLoweringPass
: public impl::AMDAIEControlCodeLoweringBase<
Expand All @@ -260,17 +285,37 @@ void AMDAIEControlCodeLoweringPass::runOnOperation() {
"ops.";
return signalPassFailure();
}
AMDAIE::AMDAIEDeviceModel deviceModel =
AMDAIE::getDeviceModel(maybeDevice.value());

RewritePatternSet patterns(context);
ConversionTarget conversionTarget(*context);
conversionTarget.addLegalDialect<AMDAIEDialect>();
conversionTarget.addIllegalOp<AMDAIE::NpuHalfDmaCpyNdOp>();
patterns.insert<HalfDmaCpyNdToNpuConverter>(context, deviceModel);
if (failed(applyPartialConversion(parentOp, conversionTarget,
std::move(patterns)))) {
return signalPassFailure();
// First conversion: HalfDmaCpyNdOp to WriteBdOp, AddressPatchOp and
// PushToQueueOp.
{
AMDAIE::AMDAIEDeviceModel deviceModel =
AMDAIE::getDeviceModel(maybeDevice.value());
RewritePatternSet patterns(context);
ConversionTarget conversionTarget(*context);
conversionTarget.addLegalDialect<AMDAIEDialect>();
conversionTarget.addIllegalOp<AMDAIE::NpuHalfDmaCpyNdOp>();
patterns.insert<HalfDmaCpyNdToNpuConverter>(context, deviceModel);

if (failed(applyPartialConversion(parentOp, conversionTarget,
std::move(patterns)))) {
return signalPassFailure();
}
}

// Second conversion: DmaWaitOp to TctSyncOp.
// The two conversions are separate to simplify the attribute handling, such
// as col, row, direction, channel, etc.
{
RewritePatternSet patterns(context);
ConversionTarget conversionTarget(*context);
conversionTarget.addLegalDialect<AMDAIEDialect>();
conversionTarget.addIllegalOp<AMDAIE::NpuDmaWaitOp>();
patterns.insert<DmaWaitToTctSyncConverter>(context);
if (failed(applyPartialConversion(parentOp, conversionTarget,
std::move(patterns)))) {
return signalPassFailure();
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,20 +199,11 @@ LogicalResult convertOp(AMDAIE::NpuAddressPatchOp op,
return success();
}

LogicalResult convertOp(AMDAIE::NpuDmaWaitOp op, TransactionBuilder &builder) {
for (Value token : op.getAsyncTokens()) {
auto pushToQueueOp =
dyn_cast_if_present<AMDAIE::NpuPushToQueueOp>(token.getDefiningOp());
if (!pushToQueueOp) {
return op.emitOpError()
<< "should operate on an `amdaie.push_to_queue` op";
}
if (failed(builder.appendTCTSync(
pushToQueueOp.getCol(), pushToQueueOp.getRow(),
static_cast<uint32_t>(pushToQueueOp.getDirection()), 1, 1,
pushToQueueOp.getChannel()))) {
return failure();
}
/// Appends a TCT sync entry for `op` to the transaction under construction.
/// The column, row, direction, tile-range sizes and channel are all read from
/// the op's attributes; the builder's result is forwarded unchanged.
LogicalResult convertOp(AMDAIE::NpuTctSyncOp op, TransactionBuilder &builder) {
  // The direction attribute is an enum; the builder takes its raw integer
  // value.
  const uint32_t direction = static_cast<uint32_t>(op.getDirection());
  // NOTE(review): row_num is passed before col_num here — confirm this matches
  // the parameter order of `TransactionBuilder::appendTCTSync`.
  return builder.appendTCTSync(op.getCol(), op.getRow(), direction,
                               op.getRowNum(), op.getColNum(),
                               op.getChannel());
}
Expand Down Expand Up @@ -274,7 +265,7 @@ LogicalResult controlCodeToTransaction(IRRewriter &rewriter,
WalkResult res = controlCodeOp->walk([&](Operation *op) {
LogicalResult switchResult =
TypeSwitch<Operation *, LogicalResult>(op)
.Case<AMDAIE::NpuAddressPatchOp, AMDAIE::NpuDmaWaitOp,
.Case<AMDAIE::NpuAddressPatchOp, AMDAIE::NpuTctSyncOp,
AMDAIE::NpuPushToQueueOp, AMDAIE::NpuWriteBdOp>(
[&](auto npuOp) {
if (failed(convertOp(npuOp, builder))) return failure();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,13 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 2048 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 0, 0, 2048>, strides = array<i32: 0, 0, 1>, use_next_bd = false, valid_bd = true}
// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32}
// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token)
// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32}
%6 = amdaie.npu.half_dma_cpy_nd async %4(%5[0] [2048] [1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo<memref<2048xi32>>
amdaie.npu.dma_wait(%6 : !amdaie.async_token)
// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 4, 16, 16>, strides = array<i32: 64, 8, 1>, use_next_bd = false, valid_bd = true}
// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32}
// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32}
// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token)
// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32}
%7 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 0] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo<memref<2048xi32>>
amdaie.npu.dma_wait(%7 : !amdaie.async_token)
amdaie.end
Expand Down Expand Up @@ -113,13 +113,13 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 0, 0, 1024>, strides = array<i32: 0, 0, 1>, use_next_bd = false, valid_bd = true}
// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32}
// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token)
// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32}
%6 = amdaie.npu.half_dma_cpy_nd async %4(%5[0] [2048] [1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo<memref<2048xbf16>>
amdaie.npu.dma_wait(%6 : !amdaie.async_token)
// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 512 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 4, 16, 8>, strides = array<i32: 32, 4, 1>, use_next_bd = false, valid_bd = true}
// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 64 : ui32}
// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32}
// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token)
// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32}
%7 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 32] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo<memref<2048xbf16>>
amdaie.npu.dma_wait(%7 : !amdaie.async_token)
amdaie.end
Expand Down Expand Up @@ -169,7 +169,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
// CHECK: amdaie.npu.write_bd {bd_id = 2 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 4, 16, 16>, strides = array<i32: 64, 8, 1>, use_next_bd = false, valid_bd = true}
// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 2 : ui32, col = 0 : ui32, offset = 0 : ui32}
// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32}
// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token)
// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32}
%6 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 0] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id_2 channel = %channel start_bd = %bd_id) : !amdaie.logicalobjectfifo<memref<2048xi32>>
amdaie.npu.dma_wait(%6 : !amdaie.async_token)
amdaie.end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
amdaie.workgroup {
amdaie.controlcode {
%0 = amdaie.npu.push_to_queue async {bd_id = 15 : ui32, channel = 0 : ui32, col = 2 : ui32, direction = 1 : i32, repeat_count = 256 : ui32, row = 0 : ui32}
amdaie.npu.dma_wait(%0 : !amdaie.async_token)
amdaie.npu.tct_sync {channel = 0 : ui32, col = 2 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32}
amdaie.end
}
}
Expand Down
Loading