diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetCDODirect.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetCDODirect.cpp
index e4ea07a95..3620e79b5 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetCDODirect.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AMDAIETargetCDODirect.cpp
@@ -159,34 +159,45 @@ LogicalResult configureBdInBlock(XAie_DmaDesc &dmaTileBd, Block &block,
   std::optional<llvm::ArrayRef<BDDimLayoutAttr>> dims = bdOp.getDimensions();
   int lenInBytes = bdOp.getLenInBytes();
   int basePlusOffsetInBytes = baseAddr + bdOp.getOffsetInBytes();
+  int32_t bufferElementTypeWidthInBytes =
+      bdOp.getBufferElementTypeWidthInBytes();
+  // aie-rt expects multiples of 32b words (see docstring on
+  // XAie_DmaSetMultiDimAddr). Thus, elementWidthIn32bWords may be a fraction:
+  // e.g. for bf16 (2 bytes) it is 0.5, so a size of 10 elements becomes 5
+  // 32b words.
+  double elementWidthIn32bWords =
+      static_cast<double>(bufferElementTypeWidthInBytes) / 4.0;
+
   if (!dims) {
     TRY_XAIE_API_EMIT_ERROR(bdOp, XAie_DmaSetAddrLen, &dmaTileBd,
                             basePlusOffsetInBytes, lenInBytes);
   } else {
     XAie_DmaTensor dmaTileBdTensor = {};
     dmaTileBdTensor.NumDim = dims->size();
-    dmaTileBdTensor.Dim = static_cast<XAie_DmaDimDesc *>(
-        calloc(dmaTileBdTensor.NumDim, sizeof(XAie_DmaDimDesc)));
-    if (!dmaTileBdTensor.Dim)
-      return bdOp.emitError("couldn't allocate array of XAie_DmaDimDesc");
-    // libxaie requires stride in multiples of 32b
-    double elementWidthIn32bWords =
-        static_cast<double>(bdOp.getBufferElementTypeWidthInBytes()) / 4.0;
+    dmaTileBdTensor.Dim = new XAie_DmaDimDesc[dmaTileBdTensor.NumDim];
     for (size_t i = 0; i < dims->size(); i++) {
       // Pass down dimensions in reverse order; in the MLIR, this allows
-      // us to specify step sizes/wraps in the same order as we would
-      // access a multi-dim C array, with the highest dimension first.
-      int j = dims->size() - i - 1;
-      uint16_t size;
-      uint32_t stride;
+      // us to specify step sizes/strides in the same order as we would for
+      // RankedTensorType/MemRefType.
+      uint16_t size = dims.value()[i].getSize();
+      uint32_t stride = dims.value()[i].getStride();
+      size_t j = dims->size() - i - 1;
       if (j > 0) {
-        stride = static_cast<uint32_t>(dims.value()[i].getStride() *
-                                       elementWidthIn32bWords);
-        size = dims.value()[i].getSize();
+        if (stride * bufferElementTypeWidthInBytes % 4 != 0) {
+          return bdOp.emitOpError("`stride` on dim ")
+                 << i
+                 << ", times element width (in bytes), should "
+                    "be a multiple of 4 bytes";
+        }
+        stride = static_cast<uint32_t>(stride * elementWidthIn32bWords);
       } else {
-        stride = dims.value()[i].getStride();
-        size = static_cast<uint16_t>(dims.value()[i].getSize() *
-                                     elementWidthIn32bWords);
+        if (size * bufferElementTypeWidthInBytes % 4 != 0) {
+          return bdOp.emitOpError("`size` on dim ")
+                 << i
+                 << ", times element width (in bytes), should "
+                    "be a multiple of 4 bytes";
+        }
+        size = static_cast<uint16_t>(size * elementWidthIn32bWords);
       }
       stride = stride > 0 ? stride : 1;
       // Assume AIE-ML architecture (ie use AieMlDimDesc instead of AieDimDesc);
@@ -204,26 +215,24 @@ LogicalResult configureBdInBlock(XAie_DmaDesc &dmaTileBd, Block &block,
   if (padDims) {
     XAie_DmaPadTensor dmaPadTensor = {};
     dmaPadTensor.NumDim = padDims->size();
-    dmaPadTensor.PadDesc = static_cast<XAie_PadDesc *>(
-        calloc(dmaPadTensor.NumDim, sizeof(XAie_PadDesc)));
-    if (!dmaPadTensor.PadDesc)
-      return bdOp.emitError("couldn't allocate array of XAie_PadDesc");
-    // libxaie requires stride in multiples of 32b
-    double elementWidthIn32bWords =
-        static_cast<double>(bdOp.getBufferElementTypeWidthInBytes()) / 4.0;
+    dmaPadTensor.PadDesc = new XAie_PadDesc[dmaPadTensor.NumDim];
     for (size_t i = 0; i < padDims->size(); i++) {
-      // Pass down dimensions in reverse order.
-      int j = padDims->size() - i - 1;
-      uint8_t before;
-      uint8_t after;
-      if (j > 0) {
-        before = static_cast<uint8_t>(padDims.value()[i].getConstPadBefore());
-        after = static_cast<uint8_t>(padDims.value()[i].getConstPadAfter());
-      } else {
-        before = static_cast<uint8_t>(padDims.value()[i].getConstPadBefore() *
-                                      elementWidthIn32bWords);
-        after = static_cast<uint8_t>(padDims.value()[i].getConstPadAfter() *
-                                     elementWidthIn32bWords);
+      uint8_t before = padDims.value()[i].getConstPadBefore();
+      uint8_t after = padDims.value()[i].getConstPadAfter();
+      size_t j = padDims->size() - i - 1;
+      if (j == 0) {
+        if (before * bufferElementTypeWidthInBytes % 4 != 0) {
+          return bdOp.emitOpError(
+              "`before` padding on inner-most dim, times element width (in "
+              "bytes), should be a multiple of 4 bytes");
+        }
+        if (after * bufferElementTypeWidthInBytes % 4 != 0) {
+          return bdOp.emitOpError(
+              "`after` padding on inner-most dim, times element width (in "
+              "bytes), should be a multiple of 4 bytes");
+        }
+        before = static_cast<uint8_t>(before * elementWidthIn32bWords);
+        after = static_cast<uint8_t>(after * elementWidthIn32bWords);
       }
       dmaPadTensor.PadDesc[j] = {before, after};
     }
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/cdo/add_12_i8_using_2d_dma_op_with_padding.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/cdo/add_12_i8_using_2d_dma_op_with_padding.mlir
index 61da0335c..745f4e348 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/cdo/add_12_i8_using_2d_dma_op_with_padding.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/cdo/add_12_i8_using_2d_dma_op_with_padding.mlir
@@ -2,35 +2,8 @@
 
 module {
   aie.device(npu1_1col) {
-    memref.global "public" @objFifo_in0 : memref<56x56xi8>
-    memref.global "public" @objFifo_out0 : memref<64x64xi8>
-    %tile_0_0 = aie.tile(0, 0)
     %tile_0_1 = aie.tile(0, 1)
-    %tile_0_2 = aie.tile(0, 2)
-    %objFifo_in1_cons_buff_0 = aie.buffer(%tile_0_2) {address = 0 : i32, sym_name = "objFifo_in1_cons_buff_0"} : memref<64x64xi8>
-    %objFifo_in1_cons_buff_1 = aie.buffer(%tile_0_2) {address = 4096 : i32, sym_name = "objFifo_in1_cons_buff_1"} : memref<64x64xi8>
-    %objFifo_out1_buff_0 = aie.buffer(%tile_0_2) {address = 8192 : i32, sym_name = "objFifo_out1_buff_0"} : memref<64x64xi8>
-    %objFifo_out1_buff_1 = aie.buffer(%tile_0_2) {address = 12288 : i32, sym_name = "objFifo_out1_buff_1"} : memref<64x64xi8>
-    aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0)
-    aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0)
-    aie.flow(%tile_0_1, DMA : 1, %tile_0_0, DMA : 0)
-    aie.flow(%tile_0_2, DMA : 0, %tile_0_1, DMA : 1)
-    aie.shim_dma_allocation @objFifo_in0(MM2S, 0, 0)
-    func.func @bobsyouruncle(%arg0: memref<61x56xi8>, %arg1: memref<32xi8>, %arg2: memref<64x64xi8>) {
-      %c0_i64 = arith.constant 0 : i64
-      %c1_i64 = arith.constant 1 : i64
-      %c56_i64 = arith.constant 56 : i64
-      %c61_i64 = arith.constant 61 : i64
-      %c64_i64 = arith.constant 64 : i64
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c61_i64, %c56_i64][%c0_i64, %c0_i64, %c56_i64, %c1_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<61x56xi8>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0} : memref<64x64xi8>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
-      return
-    }
     %objFifo_in0_cons_buff_0 = aie.buffer(%tile_0_1) {address = 0 : i32, sym_name = "objFifo_in0_cons_buff_0"} : memref<64x64xi8>
-    %objFifo_in0_cons_buff_1 = aie.buffer(%tile_0_1) {address = 4096 : i32, sym_name = "objFifo_in0_cons_buff_1"} : memref<64x64xi8>
-    %objFifo_out0_buff_0 = aie.buffer(%tile_0_1) {address = 8192 : i32, sym_name = "objFifo_out0_buff_0"} : memref<64x64xi8>
-    %objFifo_out0_buff_1 = aie.buffer(%tile_0_1) {address = 12288 : i32, sym_name = "objFifo_out0_buff_1"} : memref<64x64xi8>
     %objFifo_in0_cons_prod_lock = aie.lock(%tile_0_1, 0) {init = 1 : i32, sym_name = "objFifo_in0_cons_prod_lock"}
     %objFifo_in0_cons_cons_lock = aie.lock(%tile_0_1, 1) {init = 0 : i32, sym_name = "objFifo_in0_cons_cons_lock"}
     %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) {
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/cdo/add_21_i8_using_dma_op_with_padding.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/cdo/add_21_i8_using_dma_op_with_padding.mlir
index 83bb6a6b2..9a4b4bd9e 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/cdo/add_21_i8_using_dma_op_with_padding.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/cdo/add_21_i8_using_dma_op_with_padding.mlir
@@ -2,34 +2,9 @@
 
 module {
   aie.device(npu1_1col) {
-    memref.global "public" @objFifo_in0 : memref<16xi8>
-    memref.global "public" @objFifo_out0 : memref<16xi8>
-    %tile_0_0 = aie.tile(0, 0)
     %tile_0_1 = aie.tile(0, 1)
-    %tile_0_2 = aie.tile(0, 2)
-    %objFifo_in1_cons_buff_0 = aie.buffer(%tile_0_2) {address = 0 : i32, sym_name = "objFifo_in1_cons_buff_0"} : memref<8xi8>
-    %objFifo_in1_cons_buff_1 = aie.buffer(%tile_0_2) {address = 8 : i32, sym_name = "objFifo_in1_cons_buff_1"} : memref<8xi8>
-    %objFifo_out1_buff_0 = aie.buffer(%tile_0_2) {address = 16 : i32, sym_name = "objFifo_out1_buff_0"} : memref<8xi8>
-    %objFifo_out1_buff_1 = aie.buffer(%tile_0_2) {address = 24 : i32, sym_name = "objFifo_out1_buff_1"} : memref<8xi8>
-    aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0)
-    aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0)
-    aie.flow(%tile_0_1, DMA : 1, %tile_0_0, DMA : 0)
-    aie.flow(%tile_0_2, DMA : 0, %tile_0_1, DMA : 1)
-    aie.shim_dma_allocation @objFifo_in0(MM2S, 0, 0)
-    func.func @bobsyouruncle(%arg0: memref<64xi8>, %arg1: memref<32xi8>, %arg2: memref<64xi8>) {
-      %c0_i64 = arith.constant 0 : i64
-      %c1_i64 = arith.constant 1 : i64
-      %c32_i64 = arith.constant 32 : i64
-      %c64_i64 = arith.constant 64 : i64
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c32_i64][%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi8>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0} : memref<64xi8>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
-      return
-    }
     %objFifo_in0_cons_buff_0 = aie.buffer(%tile_0_1) {address = 0 : i32, sym_name = "objFifo_in0_cons_buff_0"} : memref<16xi8>
     %objFifo_in0_cons_buff_1 = aie.buffer(%tile_0_1) {address = 16 : i32, sym_name = "objFifo_in0_cons_buff_1"} : memref<16xi8>
-    %objFifo_out0_buff_0 = aie.buffer(%tile_0_1) {address = 32 : i32, sym_name = "objFifo_out0_buff_0"} : memref<16xi8>
-    %objFifo_out0_buff_1 = aie.buffer(%tile_0_1) {address = 48 : i32, sym_name = "objFifo_out0_buff_1"} : memref<16xi8>
     %objFifo_in0_cons_prod_lock = aie.lock(%tile_0_1, 0) {init = 2 : i32, sym_name = "objFifo_in0_cons_prod_lock"}
     %objFifo_in0_cons_cons_lock = aie.lock(%tile_0_1, 1) {init = 0 : i32, sym_name = "objFifo_in0_cons_cons_lock"}
     %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) {
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/cdo/add_378_i32_using_dma_op_with_padding.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/cdo/add_378_i32_using_dma_op_with_padding.mlir
index 60ca6ec95..bc2d46b5a 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/cdo/add_378_i32_using_dma_op_with_padding.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/cdo/add_378_i32_using_dma_op_with_padding.mlir
@@ -2,34 +2,9 @@
 
 module {
   aie.device(npu1_1col) {
-    memref.global "public" @objFifo_in0 : memref<16xi32>
-    memref.global "public" @objFifo_out0 : memref<16xi32>
-    %tile_0_0 = aie.tile(0, 0)
     %tile_0_1 = aie.tile(0, 1)
-    %tile_0_2 = aie.tile(0, 2)
-    %objFifo_in1_cons_buff_0 = aie.buffer(%tile_0_2) {address = 0 : i32, sym_name = "objFifo_in1_cons_buff_0"} : memref<8xi32>
-    %objFifo_in1_cons_buff_1 = aie.buffer(%tile_0_2) {address = 32 : i32, sym_name = "objFifo_in1_cons_buff_1"} : memref<8xi32>
-    %objFifo_out1_buff_0 = aie.buffer(%tile_0_2) {address = 64 : i32, sym_name = "objFifo_out1_buff_0"} : memref<8xi32>
-    %objFifo_out1_buff_1 = aie.buffer(%tile_0_2) {address = 96 : i32, sym_name = "objFifo_out1_buff_1"} : memref<8xi32>
-    aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0)
-    aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0)
-    aie.flow(%tile_0_1, DMA : 1, %tile_0_0, DMA : 0)
-    aie.flow(%tile_0_2, DMA : 0, %tile_0_1, DMA : 1)
-    aie.shim_dma_allocation @objFifo_in0(MM2S, 0, 0)
-    func.func @bobsyouruncle(%arg0: memref<64xi32>, %arg1: memref<32xi32>, %arg2: memref<64xi32>) {
-      %c0_i64 = arith.constant 0 : i64
-      %c1_i64 = arith.constant 1 : i64
-      %c52_i64 = arith.constant 52 : i64
-      %c64_i64 = arith.constant 64 : i64
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c52_i64][%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0} : memref<64xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
-      return
-    }
     %objFifo_in0_cons_buff_0 = aie.buffer(%tile_0_1) {address = 0 : i32, sym_name = "objFifo_in0_cons_buff_0"} : memref<16xi32>
     %objFifo_in0_cons_buff_1 = aie.buffer(%tile_0_1) {address = 64 : i32, sym_name = "objFifo_in0_cons_buff_1"} : memref<16xi32>
-    %objFifo_out0_buff_0 = aie.buffer(%tile_0_1) {address = 128 : i32, sym_name = "objFifo_out0_buff_0"} : memref<16xi32>
-    %objFifo_out0_buff_1 = aie.buffer(%tile_0_1) {address = 192 : i32, sym_name = "objFifo_out0_buff_1"} : memref<16xi32>
     %objFifo_in0_cons_prod_lock = aie.lock(%tile_0_1, 0) {init = 2 : i32, sym_name = "objFifo_in0_cons_prod_lock"}
     %objFifo_in0_cons_cons_lock = aie.lock(%tile_0_1, 1) {init = 0 : i32, sym_name = "objFifo_in0_cons_cons_lock"}
     %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) {
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/cdo/bad_padding.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/cdo/bad_padding.mlir
new file mode 100644
index 000000000..196667049
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/cdo/bad_padding.mlir
@@ -0,0 +1,71 @@
+// RUN: iree-opt --verify-diagnostics --split-input-file %s
+
+module {
+  aie.device(xcve2802) {
+    %t1 = aie.tile(1, 1)
+    %buf = aie.buffer(%t1) : memref<256xi8>
+    %mem = aie.memtile_dma(%t1) {
+      aie.dma_start("MM2S", 0, ^bd0, ^end)
+      ^bd0:
+        // expected-error@+1 {{'aie.dma_bd' op Inner-most padding-before count must result in padding in 32-bit words.}}
+        aie.dma_bd(%buf : memref<256xi8>, 0, 8, [<size = 4, stride = 1>], [<const_pad_before = 2, const_pad_after = 2>], pad_value = 0)
+        aie.next_bd ^end
+      ^end:
+        aie.end
+    }
+  }
+}
+
+// -----
+
+module {
+  aie.device(xcve2802) {
+    %t1 = aie.tile(1, 1)
+    %buf = aie.buffer(%t1) : memref<256xi32>
+    %mem = aie.memtile_dma(%t1) {
+      aie.dma_start("MM2S", 0, ^bd0, ^end)
+      ^bd0:
+        // expected-error@+1 {{'aie.dma_bd' op Data exceeds len after padding.}}
+        aie.dma_bd(%buf : memref<256xi32>, 0, 4, [<size = 2, stride = 128>], [<const_pad_before = 2, const_pad_after = 1>], pad_value = 0)
+        aie.next_bd ^end
+      ^end:
+        aie.end
+    }
+  }
+}
+
+// -----
+
+module {
+  aie.device(xcve2802) {
+    %t1 = aie.tile(1, 1)
+    %buf = aie.buffer(%t1) : memref<256xbf16>
+    %mem = aie.memtile_dma(%t1) {
+      aie.dma_start("MM2S", 0, ^bd0, ^end)
+      ^bd0:
+        // expected-error@+1 {{'aie.dma_bd' op Inner-most padding-before count must result in padding in 32-bit words.}}
+        aie.dma_bd(%buf : memref<256xbf16>, 0, 256, [<size = 4, stride = 1>], [<const_pad_before = 3, const_pad_after = 2>], pad_value = 0)
+        aie.next_bd ^end
+      ^end:
+        aie.end
+    }
+  }
+}
+
+// -----
+
+module {
+  aie.device(xcve2802) {
+    %t1 = aie.tile(1, 1)
+    %buf = aie.buffer(%t1) : memref<256xbf16>
+    %mem = aie.memtile_dma(%t1) {
+      aie.dma_start("MM2S", 0, ^bd0, ^end)
+      ^bd0:
+        // expected-error@+1 {{'aie.dma_bd' op Inner-most padding-after count must result in padding in 32-bit words.}}
+        aie.dma_bd(%buf : memref<256xbf16>, 0, 256, [<size = 4, stride = 1>], [<const_pad_before = 2, const_pad_after = 3>], pad_value = 0)
+        aie.next_bd ^end
+      ^end:
+        aie.end
+    }
+  }
+}