diff --git a/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAOps.cpp b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAOps.cpp
index 1208ad0..913bfab 100644
--- a/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAOps.cpp
+++ b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAOps.cpp
@@ -290,8 +290,14 @@ StartTensorCopyOp::bufferize(RewriterBase &rewriter,
   // Zero out the entire buffer prior to overwriting it with the copied values.
   // TODO: This could be optimized to only zero regions that won't be filled
   //  with the copied values at the cost of 2^rank transfers instead of two.
-  if (hasPadding() && !getUndefPadding())
-    rewriter.create<StartZeroMemTransferOp>(getLoc(), *alloc);
+  if (hasPadding() && !getUndefPadding()) {
+    Value token = rewriter.create<StartZeroMemTransferOp>(getLoc(), *alloc);
+    // TODO: This wait is required as the zero mem transfer writes to the same
+    //       memory as the transfer below. This is currently unspecified
+    //       behaviour (both in the DMA dialect and in Snitch as far as we
+    //       know).
+    rewriter.create<WaitForTransfersOp>(getLoc(), token);
+  }
 
   // Subview into the original memory without any padding.
   // As we only add padding at the end of the dimensions, the offsets are always
diff --git a/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAOps.td b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAOps.td
index b5f23fd..7307326 100644
--- a/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAOps.td
+++ b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAOps.td
@@ -133,7 +133,12 @@ def DMA_StartTransferOp : DMA_Op<"start_transfer",
 
     The DMA operation is likely (but not guaranteed) to run asynchronous and
     its completion only guaranteed by executing the `wait_for_transfers`
-    operation with the token returned by this operation or a later one.
+    operation with the token returned by this operation.
+
+    Due to the unspecified order and concurrency of transfers, the resulting
+    state of a MemRef is unspecified if at any point two transfers not-yet
+    completed transfers exist that either write to the same memory location
+    or writes to a memory location read by another transfer.
   }];
 
   let arguments = (ins
@@ -154,7 +159,13 @@ def DMA_StartZeroMemTransferOp : DMA_Op<"start_zero_mem_transfer",
   [MemoryEffects<[MemWrite]>]> {
 
   let description = [{
+    Starts a DMA transfer which when completed has filled the given MemRef
+    entirely with bit of 0.
+    I.e. this is equal to C's `memset` with zero, but asynchronous.
 
+    The semantics are identical to a `start_transfer` operation where the
+    source is a MemRef identical in shape to `filled` consisting of just
+    0 bits.
   }];
 
   let arguments = (ins
diff --git a/codegen/tests/Dialect/DMA/IR/bufferization.mlir b/codegen/tests/Dialect/DMA/IR/bufferization.mlir
index c6843fc..c3a1802 100644
--- a/codegen/tests/Dialect/DMA/IR/bufferization.mlir
+++ b/codegen/tests/Dialect/DMA/IR/bufferization.mlir
@@ -91,7 +91,8 @@ func.func @tensor_copy_pad(%arg0 : tensor<?x?xf32>, %pad0 : index, %pad1 : index
   // CHECK: %[[NEW_DIM0:.*]] = affine.apply #[[$MAP2]]()[%[[DIM0]], %[[PAD0]]]
   // CHECK: %[[NEW_DIM1:.*]] = affine.apply #[[$MAP2]]()[%[[DIM1]], %[[PAD1]]]
   // CHECK: %[[ALLOC:.*]] = memref.alloc(%[[NEW_DIM0]], %[[NEW_DIM1]])
-  // CHECK: start_zero_mem_transfer %[[ALLOC]]
+  // CHECK: %[[ZERO_TOKEN:.*]] = dma.start_zero_mem_transfer %[[ALLOC]]
+  // CHECK: dma.wait_for_transfers %[[ZERO_TOKEN]]
   // CHECK: %[[UNPADDED:.*]] = memref.subview %[[ALLOC]][0, 0] [%[[DIM0]], %[[DIM1]]] [1, 1]
   // CHECK: %[[TOKEN:.*]] = dma.start_transfer from %[[COPY]]
   // CHECK-SAME: to %[[UNPADDED]]