diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetCDODirect.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetCDODirect.cpp index e4ea07a95..3620e79b5 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetCDODirect.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetCDODirect.cpp @@ -159,34 +159,45 @@ LogicalResult configureBdInBlock(XAie_DmaDesc &dmaTileBd, Block &block, std::optional> dims = bdOp.getDimensions(); int lenInBytes = bdOp.getLenInBytes(); int basePlusOffsetInBytes = baseAddr + bdOp.getOffsetInBytes(); + int32_t bufferElementTypeWidthInBytes = + bdOp.getBufferElementTypeWidthInBytes(); + // aie-rt expects multiples of 32b words (see docstring on + // XAie_DmaSetMultiDimAddr). Thus, elementWidthIn32bWords is possibly a + // fraction, e.g. bf16 => elementWidthIn32bWords == 0.5 so that size = 10 => 5 + // 32b words + double elementWidthIn32bWords = + static_cast(bufferElementTypeWidthInBytes) / 4.0; + if (!dims) { TRY_XAIE_API_EMIT_ERROR(bdOp, XAie_DmaSetAddrLen, &dmaTileBd, basePlusOffsetInBytes, lenInBytes); } else { XAie_DmaTensor dmaTileBdTensor = {}; dmaTileBdTensor.NumDim = dims->size(); - dmaTileBdTensor.Dim = static_cast( - calloc(dmaTileBdTensor.NumDim, sizeof(XAie_DmaDimDesc))); - if (!dmaTileBdTensor.Dim) - return bdOp.emitError("couldn't allocate array of XAie_DmaDimDesc"); - // libxaie requires stride in multiples of 32b - double elementWidthIn32bWords = - static_cast(bdOp.getBufferElementTypeWidthInBytes()) / 4.0; + dmaTileBdTensor.Dim = new XAie_DmaDimDesc[dmaTileBdTensor.NumDim]; for (size_t i = 0; i < dims->size(); i++) { // Pass down dimensions in reverse order; in the MLIR, this allows - // us to specify step sizes/wraps in the same order as we would - // access a multi-dim C array, with the highest dimension first. - int j = dims->size() - i - 1; - uint16_t size; - uint32_t stride; + // us to specify step sizes/strides in the same order as we would for + // RankedTensorType/MemRefType. + uint16_t size = dims.value()[i].getSize(); + uint32_t stride = dims.value()[i].getStride(); + size_t j = dims->size() - i - 1; if (j > 0) { - stride = static_cast(dims.value()[i].getStride() * - elementWidthIn32bWords); - size = dims.value()[i].getSize(); + if (stride * bufferElementTypeWidthInBytes % 4 != 0) { + return bdOp.emitOpError("`stride` on dim ") + << i + << ", times element width (in bytes), should " + "be a multiple of 4 bytes"; + } + stride = static_cast(stride * elementWidthIn32bWords); } else { - stride = dims.value()[i].getStride(); - size = static_cast(dims.value()[i].getSize() * - elementWidthIn32bWords); + if (size * bufferElementTypeWidthInBytes % 4 != 0) { + return bdOp.emitOpError("`size` on dim ") + << i + << ", times element width (in bytes), should " + "be a multiple of 4 bytes"; + } + size = static_cast(size * elementWidthIn32bWords); } stride = stride > 0 ? stride : 1; // Assume AIE-ML architecture (ie use AieMlDimDesc instead of AieDimDesc); @@ -204,26 +215,24 @@ LogicalResult configureBdInBlock(XAie_DmaDesc &dmaTileBd, Block &block, if (padDims) { XAie_DmaPadTensor dmaPadTensor = {}; dmaPadTensor.NumDim = padDims->size(); - dmaPadTensor.PadDesc = static_cast( - calloc(dmaPadTensor.NumDim, sizeof(XAie_PadDesc))); - if (!dmaPadTensor.PadDesc) - return bdOp.emitError("couldn't allocate array of XAie_PadDesc"); - // libxaie requires stride in multiples of 32b - double elementWidthIn32bWords = - static_cast(bdOp.getBufferElementTypeWidthInBytes()) / 4.0; + dmaPadTensor.PadDesc = new XAie_PadDesc[dmaPadTensor.NumDim]; for (size_t i = 0; i < padDims->size(); i++) { - // Pass down dimensions in reverse order. - int j = padDims->size() - i - 1; - uint8_t before; - uint8_t after; - if (j > 0) { - before = static_cast(padDims.value()[i].getConstPadBefore()); - after = static_cast(padDims.value()[i].getConstPadAfter()); - } else { - before = static_cast(padDims.value()[i].getConstPadBefore() * - elementWidthIn32bWords); - after = static_cast(padDims.value()[i].getConstPadAfter() * - elementWidthIn32bWords); + uint8_t before = padDims.value()[i].getConstPadBefore(); + uint8_t after = padDims.value()[i].getConstPadAfter(); + size_t j = padDims->size() - i - 1; + if (j == 0) { + if (before * bufferElementTypeWidthInBytes % 4 != 0) { + return bdOp.emitOpError( + "`before` padding on inner-most dim, times element width (in " + "bytes), should be a multiple of 4 bytes"); + } + if (after * bufferElementTypeWidthInBytes % 4 != 0) { + return bdOp.emitOpError( + "`after` padding on inner-most dim, times element width (in " + "bytes), should be a multiple of 4 bytes"); + } + before = static_cast(before * elementWidthIn32bWords); + after = static_cast(after * elementWidthIn32bWords); } dmaPadTensor.PadDesc[j] = {before, after}; } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/cdo/add_12_i8_using_2d_dma_op_with_padding.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/cdo/add_12_i8_using_2d_dma_op_with_padding.mlir index 61da0335c..745f4e348 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/cdo/add_12_i8_using_2d_dma_op_with_padding.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/cdo/add_12_i8_using_2d_dma_op_with_padding.mlir @@ -2,35 +2,8 @@ module { aie.device(npu1_1col) { - memref.global "public" @objFifo_in0 : memref<56x56xi8> - memref.global "public" @objFifo_out0 : memref<64x64xi8> - %tile_0_0 = aie.tile(0, 0) %tile_0_1 = aie.tile(0, 1) - %tile_0_2 = aie.tile(0, 2) - %objFifo_in1_cons_buff_0 = aie.buffer(%tile_0_2) {address = 0 : i32, sym_name = "objFifo_in1_cons_buff_0"} : memref<64x64xi8> - %objFifo_in1_cons_buff_1 = aie.buffer(%tile_0_2) {address = 4096 : i32, sym_name = "objFifo_in1_cons_buff_1"} : memref<64x64xi8> - %objFifo_out1_buff_0 = aie.buffer(%tile_0_2) {address = 8192 : i32, sym_name = "objFifo_out1_buff_0"} : memref<64x64xi8> - %objFifo_out1_buff_1 = aie.buffer(%tile_0_2) {address = 12288 : i32, sym_name = "objFifo_out1_buff_1"} : memref<64x64xi8> - aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0) - aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0) - aie.flow(%tile_0_1, DMA : 1, %tile_0_0, DMA : 0) - aie.flow(%tile_0_2, DMA : 0, %tile_0_1, DMA : 1) - aie.shim_dma_allocation @objFifo_in0(MM2S, 0, 0) - func.func @bobsyouruncle(%arg0: memref<61x56xi8>, %arg1: memref<32xi8>, %arg2: memref<64x64xi8>) { - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c56_i64 = arith.constant 56 : i64 - %c61_i64 = arith.constant 61 : i64 - %c64_i64 = arith.constant 64 : i64 - aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c61_i64, %c56_i64][%c0_i64, %c0_i64, %c56_i64, %c1_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<61x56xi8> - aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0} : memref<64x64xi8> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return - } %objFifo_in0_cons_buff_0 = aie.buffer(%tile_0_1) {address = 0 : i32, sym_name = "objFifo_in0_cons_buff_0"} : memref<64x64xi8> - %objFifo_in0_cons_buff_1 = aie.buffer(%tile_0_1) {address = 4096 : i32, sym_name = "objFifo_in0_cons_buff_1"} : memref<64x64xi8> - %objFifo_out0_buff_0 = aie.buffer(%tile_0_1) {address = 8192 : i32, sym_name = "objFifo_out0_buff_0"} : memref<64x64xi8> - %objFifo_out0_buff_1 = aie.buffer(%tile_0_1) {address = 12288 : i32, sym_name = "objFifo_out0_buff_1"} : memref<64x64xi8> %objFifo_in0_cons_prod_lock = aie.lock(%tile_0_1, 0) {init = 1 : i32, sym_name = "objFifo_in0_cons_prod_lock"} %objFifo_in0_cons_cons_lock = aie.lock(%tile_0_1, 1) {init = 0 : i32, sym_name = "objFifo_in0_cons_cons_lock"} %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/cdo/add_21_i8_using_dma_op_with_padding.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/cdo/add_21_i8_using_dma_op_with_padding.mlir index 83bb6a6b2..9a4b4bd9e 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/cdo/add_21_i8_using_dma_op_with_padding.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/cdo/add_21_i8_using_dma_op_with_padding.mlir @@ -2,34 +2,9 @@ module { aie.device(npu1_1col) { - memref.global "public" @objFifo_in0 : memref<16xi8> - memref.global "public" @objFifo_out0 : memref<16xi8> - %tile_0_0 = aie.tile(0, 0) %tile_0_1 = aie.tile(0, 1) - %tile_0_2 = aie.tile(0, 2) - %objFifo_in1_cons_buff_0 = aie.buffer(%tile_0_2) {address = 0 : i32, sym_name = "objFifo_in1_cons_buff_0"} : memref<8xi8> - %objFifo_in1_cons_buff_1 = aie.buffer(%tile_0_2) {address = 8 : i32, sym_name = "objFifo_in1_cons_buff_1"} : memref<8xi8> - %objFifo_out1_buff_0 = aie.buffer(%tile_0_2) {address = 16 : i32, sym_name = "objFifo_out1_buff_0"} : memref<8xi8> - %objFifo_out1_buff_1 = aie.buffer(%tile_0_2) {address = 24 : i32, sym_name = "objFifo_out1_buff_1"} : memref<8xi8> - aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0) - aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0) - aie.flow(%tile_0_1, DMA : 1, %tile_0_0, DMA : 0) - aie.flow(%tile_0_2, DMA : 0, %tile_0_1, DMA : 1) - aie.shim_dma_allocation @objFifo_in0(MM2S, 0, 0) - func.func @bobsyouruncle(%arg0: memref<64xi8>, %arg1: memref<32xi8>, %arg2: memref<64xi8>) { - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c32_i64 = arith.constant 32 : i64 - %c64_i64 = arith.constant 64 : i64 - aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c32_i64][%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi8> - aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0} : memref<64xi8> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return - } %objFifo_in0_cons_buff_0 = aie.buffer(%tile_0_1) {address = 0 : i32, sym_name = "objFifo_in0_cons_buff_0"} : memref<16xi8> %objFifo_in0_cons_buff_1 = aie.buffer(%tile_0_1) {address = 16 : i32, sym_name = "objFifo_in0_cons_buff_1"} : memref<16xi8> - %objFifo_out0_buff_0 = aie.buffer(%tile_0_1) {address = 32 : i32, sym_name = "objFifo_out0_buff_0"} : memref<16xi8> - %objFifo_out0_buff_1 = aie.buffer(%tile_0_1) {address = 48 : i32, sym_name = "objFifo_out0_buff_1"} : memref<16xi8> %objFifo_in0_cons_prod_lock = aie.lock(%tile_0_1, 0) {init = 2 : i32, sym_name = "objFifo_in0_cons_prod_lock"} %objFifo_in0_cons_cons_lock = aie.lock(%tile_0_1, 1) {init = 0 : i32, sym_name = "objFifo_in0_cons_cons_lock"} %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/cdo/add_378_i32_using_dma_op_with_padding.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/cdo/add_378_i32_using_dma_op_with_padding.mlir index 60ca6ec95..bc2d46b5a 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/cdo/add_378_i32_using_dma_op_with_padding.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/cdo/add_378_i32_using_dma_op_with_padding.mlir @@ -2,34 +2,9 @@ module { aie.device(npu1_1col) { - memref.global "public" @objFifo_in0 : memref<16xi32> - memref.global "public" @objFifo_out0 : memref<16xi32> - %tile_0_0 = aie.tile(0, 0) %tile_0_1 = aie.tile(0, 1) - %tile_0_2 = aie.tile(0, 2) - %objFifo_in1_cons_buff_0 = aie.buffer(%tile_0_2) {address = 0 : i32, sym_name = "objFifo_in1_cons_buff_0"} : memref<8xi32> - %objFifo_in1_cons_buff_1 = aie.buffer(%tile_0_2) {address = 32 : i32, sym_name = "objFifo_in1_cons_buff_1"} : memref<8xi32> - %objFifo_out1_buff_0 = aie.buffer(%tile_0_2) {address = 64 : i32, sym_name = "objFifo_out1_buff_0"} : memref<8xi32> - %objFifo_out1_buff_1 = aie.buffer(%tile_0_2) {address = 96 : i32, sym_name = "objFifo_out1_buff_1"} : memref<8xi32> - aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0) - aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0) - aie.flow(%tile_0_1, DMA : 1, %tile_0_0, DMA : 0) - aie.flow(%tile_0_2, DMA : 0, %tile_0_1, DMA : 1) - aie.shim_dma_allocation @objFifo_in0(MM2S, 0, 0) - func.func @bobsyouruncle(%arg0: memref<64xi32>, %arg1: memref<32xi32>, %arg2: memref<64xi32>) { - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c52_i64 = arith.constant 52 : i64 - %c64_i64 = arith.constant 64 : i64 - aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c52_i64][%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0} : memref<64xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return - } %objFifo_in0_cons_buff_0 = aie.buffer(%tile_0_1) {address = 0 : i32, sym_name = "objFifo_in0_cons_buff_0"} : memref<16xi32> %objFifo_in0_cons_buff_1 = aie.buffer(%tile_0_1) {address = 64 : i32, sym_name = "objFifo_in0_cons_buff_1"} : memref<16xi32> - %objFifo_out0_buff_0 = aie.buffer(%tile_0_1) {address = 128 : i32, sym_name = "objFifo_out0_buff_0"} : memref<16xi32> - %objFifo_out0_buff_1 = aie.buffer(%tile_0_1) {address = 192 : i32, sym_name = "objFifo_out0_buff_1"} : memref<16xi32> %objFifo_in0_cons_prod_lock = aie.lock(%tile_0_1, 0) {init = 2 : i32, sym_name = "objFifo_in0_cons_prod_lock"} %objFifo_in0_cons_cons_lock = aie.lock(%tile_0_1, 1) {init = 0 : i32, sym_name = "objFifo_in0_cons_cons_lock"} %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/cdo/bad_padding.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/cdo/bad_padding.mlir new file mode 100644 index 000000000..196667049 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/cdo/bad_padding.mlir @@ -0,0 +1,71 @@ +// RUN: iree-opt --verify-diagnostics --split-input-file %s + +module { + aie.device(xcve2802) { + %t1 = aie.tile(1, 1) + %buf = aie.buffer(%t1) : memref<256xi8> + %mem = aie.memtile_dma(%t1) { + aie.dma_start("MM2S", 0, ^bd0, ^end) + ^bd0: + // expected-error@+1 {{'aie.dma_bd' op Inner-most padding-before count must result in padding in 32-bit words.}} + aie.dma_bd(%buf : memref<256xi8>, 0, 8, [], [], pad_value = 0) + aie.next_bd ^end + ^end: + aie.end + } + } +} + +// ----- + +module { + aie.device(xcve2802) { + %t1 = aie.tile(1, 1) + %buf = aie.buffer(%t1) : memref<256xi32> + %mem = aie.memtile_dma(%t1) { + aie.dma_start("MM2S", 0, ^bd0, ^end) + ^bd0: + // expected-error@+1 {{'aie.dma_bd' op Data exceeds len after padding.}} + aie.dma_bd(%buf : memref<256xi32>, 0, 4, [], [], pad_value = 0) + aie.next_bd ^end + ^end: + aie.end + } + } +} + +// ----- + +module { + aie.device(xcve2802) { + %t1 = aie.tile(1, 1) + %buf = aie.buffer(%t1) : memref<256xbf16> + %mem = aie.memtile_dma(%t1) { + aie.dma_start("MM2S", 0, ^bd0, ^end) + ^bd0: + // expected-error@+1 {{'aie.dma_bd' op Inner-most padding-before count must result in padding in 32-bit words.}} + aie.dma_bd(%buf : memref<256xbf16>, 0, 256, [], [], pad_value = 0) + aie.next_bd ^end + ^end: + aie.end + } + } +} + +// ----- + +module { + aie.device(xcve2802) { + %t1 = aie.tile(1, 1) + %buf = aie.buffer(%t1) : memref<256xbf16> + %mem = aie.memtile_dma(%t1) { + aie.dma_start("MM2S", 0, ^bd0, ^end) + ^bd0: + // expected-error@+1 {{'aie.dma_bd' op Inner-most padding-after count must result in padding in 32-bit words.}} + aie.dma_bd(%buf : memref<256xbf16>, 0, 256, [], [], pad_value = 0) + aie.next_bd ^end + ^end: + aie.end + } + } +}