Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
Yu-Zhewen committed Dec 16, 2024
1 parent f7cd097 commit 3922f33
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 31 deletions.
25 changes: 25 additions & 0 deletions compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -1022,6 +1022,31 @@ def AMDAIE_NpuWriteBdOp: AMDAIE_Op<"npu.write_bd"> {
let assemblyFormat = [{ attr-dict }];
}

// NPU controller op that waits on Task Completion Tokens (TCTs) for a
// rectangular range of tiles. It carries attributes only and is printed/parsed
// purely as an attribute dictionary.
def AMDAIE_NpuTctSyncOp: AMDAIE_Op<"npu.tct_sync"> {
  let summary = "Wait for the TCTs to be emitted.";
  let description = [{
    This NPU controller operation synchronizes on the Task Completion Tokens
    (TCTs) emitted on the specified `channel` and `direction`. The range of
    tiles to synchronize on is defined by the columns [col, col + col_num) and
    the rows [row, row + row_num).

    Example:

    ```mlir
    amdaie.npu.tct_sync {col = 0 : ui32, row = 0 : ui32, channel = 0 : ui32,
                         direction = 1 : i32, col_num = 1 : ui32, row_num = 1 : ui32}
    ```
  }];
  // `col`/`row` give the start of the tile range; `col_num`/`row_num` give its
  // extent in each dimension (see the half-open intervals in the description).
  let arguments = (
    ins UI32Attr:$col,
        UI32Attr:$row,
        DMAChannelDir:$direction,
        UI32Attr:$channel,
        UI32Attr:$col_num,
        UI32Attr:$row_num
  );
  let assemblyFormat = [{ attr-dict }];
}

//===----------------------------------------------------------------------===//
// IREE AMDAIE LogicalObjectFifo Ops
//===----------------------------------------------------------------------===//
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,31 @@ struct HalfDmaCpyNdToNpuConverter final
uint8_t minStrideBitWidth;
};

/// Conversion pattern that rewrites an `AMDAIE::NpuDmaWaitOp` into
/// `AMDAIE::NpuTctSyncOp`s.
///
/// One `NpuTctSyncOp` is created per async token the wait op operates on. Its
/// col, row, direction and channel are copied from the
/// `AMDAIE::NpuPushToQueueOp` that defines the token, and both the column and
/// row counts are set to 1. The wait op itself is erased afterwards.
///
/// If any token is not defined by an `AMDAIE::NpuPushToQueueOp`, an op error is
/// emitted on the wait op and the pattern fails.
struct DmaWaitToTctSyncConverter final
    : OpConversionPattern<AMDAIE::NpuDmaWaitOp> {
  using OpConversionPattern::OpConversionPattern;

  LogicalResult matchAndRewrite(
      AMDAIE::NpuDmaWaitOp op, OpAdaptor adaptor,
      ConversionPatternRewriter &rewriter) const override {
    LLVM_DEBUG(llvm::dbgs() << "matchAndRewrite[AMDAIE::NpuDmaWaitOp]\n");

    for (Value asyncToken : op.getAsyncTokens()) {
      // Happy path: the token comes straight from a push-to-queue op, whose
      // attributes describe the single tile/channel to synchronize on.
      if (auto queueOp = dyn_cast_if_present<AMDAIE::NpuPushToQueueOp>(
              asyncToken.getDefiningOp())) {
        rewriter.create<AMDAIE::NpuTctSyncOp>(
            op.getLoc(), queueOp.getCol(), queueOp.getRow(),
            queueOp.getDirection(), queueOp.getChannel(),
            /*col_num=*/1, /*row_num=*/1);
        continue;
      }
      // Any other producer (or no defining op at all) is not supported.
      return op.emitOpError()
             << "should operate on an `amdaie.push_to_queue` op";
    }

    // Every token has been turned into a sync op; the wait op is now redundant.
    rewriter.eraseOp(op);
    return success();
  }
};

namespace {
class AMDAIEControlCodeLoweringPass
: public impl::AMDAIEControlCodeLoweringBase<
Expand All @@ -260,17 +285,37 @@ void AMDAIEControlCodeLoweringPass::runOnOperation() {
"ops.";
return signalPassFailure();
}
AMDAIE::AMDAIEDeviceModel deviceModel =
AMDAIE::getDeviceModel(maybeDevice.value());

RewritePatternSet patterns(context);
ConversionTarget conversionTarget(*context);
conversionTarget.addLegalDialect<AMDAIEDialect>();
conversionTarget.addIllegalOp<AMDAIE::NpuHalfDmaCpyNdOp>();
patterns.insert<HalfDmaCpyNdToNpuConverter>(context, deviceModel);
if (failed(applyPartialConversion(parentOp, conversionTarget,
std::move(patterns)))) {
return signalPassFailure();
// First conversion: HalfDmaCpyNdOp to WriteBdOp, AddressPatchOp and
// PushToQueueOp.
{
AMDAIE::AMDAIEDeviceModel deviceModel =
AMDAIE::getDeviceModel(maybeDevice.value());
RewritePatternSet patterns(context);
ConversionTarget conversionTarget(*context);
conversionTarget.addLegalDialect<AMDAIEDialect>();
conversionTarget.addIllegalOp<AMDAIE::NpuHalfDmaCpyNdOp>();
patterns.insert<HalfDmaCpyNdToNpuConverter>(context, deviceModel);

if (failed(applyPartialConversion(parentOp, conversionTarget,
std::move(patterns)))) {
return signalPassFailure();
}
}

// Second conversion: DmaWaitOp to TctSyncOp.
// The two conversions are separate to simplify the attribute handling, such
// as col, row, direction, channel, etc.
{
RewritePatternSet patterns(context);
ConversionTarget conversionTarget(*context);
conversionTarget.addLegalDialect<AMDAIEDialect>();
conversionTarget.addIllegalOp<AMDAIE::NpuDmaWaitOp>();
patterns.insert<DmaWaitToTctSyncConverter>(context);
if (failed(applyPartialConversion(parentOp, conversionTarget,
std::move(patterns)))) {
return signalPassFailure();
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,20 +199,11 @@ LogicalResult convertOp(AMDAIE::NpuAddressPatchOp op,
return success();
}

LogicalResult convertOp(AMDAIE::NpuDmaWaitOp op, TransactionBuilder &builder) {
for (Value token : op.getAsyncTokens()) {
auto pushToQueueOp =
dyn_cast_if_present<AMDAIE::NpuPushToQueueOp>(token.getDefiningOp());
if (!pushToQueueOp) {
return op.emitOpError()
<< "should operate on an `amdaie.push_to_queue` op";
}
if (failed(builder.appendTCTSync(
pushToQueueOp.getCol(), pushToQueueOp.getRow(),
static_cast<uint32_t>(pushToQueueOp.getDirection()), 1, 1,
pushToQueueOp.getChannel()))) {
return failure();
}
/// Appends a TCT sync entry for `op` to the transaction under construction.
///
/// The op's col, row, direction (cast to `uint32_t`), row count, column count
/// and channel are forwarded to `TransactionBuilder::appendTCTSync`. Returns
/// failure if the builder reports failure, success otherwise.
LogicalResult convertOp(AMDAIE::NpuTctSyncOp op, TransactionBuilder &builder) {
  const auto direction = static_cast<uint32_t>(op.getDirection());
  // NOTE(review): the row count is passed before the column count here, which
  // differs from the (col, row) order of the leading arguments -- confirm this
  // matches the parameter order of `appendTCTSync`.
  const bool appendFailed = failed(
      builder.appendTCTSync(op.getCol(), op.getRow(), direction,
                            op.getRowNum(), op.getColNum(), op.getChannel()));
  return failure(appendFailed);
}
Expand Down Expand Up @@ -274,7 +265,7 @@ LogicalResult controlCodeToTransaction(IRRewriter &rewriter,
WalkResult res = controlCodeOp->walk([&](Operation *op) {
LogicalResult switchResult =
TypeSwitch<Operation *, LogicalResult>(op)
.Case<AMDAIE::NpuAddressPatchOp, AMDAIE::NpuDmaWaitOp,
.Case<AMDAIE::NpuAddressPatchOp, AMDAIE::NpuTctSyncOp,
AMDAIE::NpuPushToQueueOp, AMDAIE::NpuWriteBdOp>(
[&](auto npuOp) {
if (failed(convertOp(npuOp, builder))) return failure();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,13 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 2048 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 0, 0, 2048>, strides = array<i32: 0, 0, 1>, use_next_bd = false, valid_bd = true}
// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32}
// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token)
// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32}
%6 = amdaie.npu.half_dma_cpy_nd async %4(%5[0] [2048] [1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo<memref<2048xi32>>
amdaie.npu.dma_wait(%6 : !amdaie.async_token)
// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 4, 16, 16>, strides = array<i32: 64, 8, 1>, use_next_bd = false, valid_bd = true}
// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32}
// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32}
// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token)
// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32}
%7 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 0] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo<memref<2048xi32>>
amdaie.npu.dma_wait(%7 : !amdaie.async_token)
amdaie.end
Expand Down Expand Up @@ -113,13 +113,13 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 0, 0, 1024>, strides = array<i32: 0, 0, 1>, use_next_bd = false, valid_bd = true}
// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32}
// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token)
// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32}
%6 = amdaie.npu.half_dma_cpy_nd async %4(%5[0] [2048] [1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo<memref<2048xbf16>>
amdaie.npu.dma_wait(%6 : !amdaie.async_token)
// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 512 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 4, 16, 8>, strides = array<i32: 32, 4, 1>, use_next_bd = false, valid_bd = true}
// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 64 : ui32}
// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32}
// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token)
// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32}
%7 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 32] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo<memref<2048xbf16>>
amdaie.npu.dma_wait(%7 : !amdaie.async_token)
amdaie.end
Expand Down Expand Up @@ -169,7 +169,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
// CHECK: amdaie.npu.write_bd {bd_id = 2 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 4, 16, 16>, strides = array<i32: 64, 8, 1>, use_next_bd = false, valid_bd = true}
// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 2 : ui32, col = 0 : ui32, offset = 0 : ui32}
// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32}
// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token)
// CHECK: amdaie.npu.tct_sync {channel = 0 : ui32, col = 0 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32}
%6 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 0] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id_2 channel = %channel start_bd = %bd_id) : !amdaie.logicalobjectfifo<memref<2048xi32>>
amdaie.npu.dma_wait(%6 : !amdaie.async_token)
amdaie.end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
amdaie.workgroup {
amdaie.controlcode {
%0 = amdaie.npu.push_to_queue async {bd_id = 15 : ui32, channel = 0 : ui32, col = 2 : ui32, direction = 1 : i32, repeat_count = 256 : ui32, row = 0 : ui32}
amdaie.npu.dma_wait(%0 : !amdaie.async_token)
amdaie.npu.tct_sync {channel = 0 : ui32, col = 2 : ui32, col_num = 1 : ui32, direction = 1 : i32, row = 0 : ui32, row_num = 1 : ui32}
amdaie.end
}
}
Expand Down

0 comments on commit 3922f33

Please sign in to comment.