diff --git a/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchOps.cpp b/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchOps.cpp
index a14b822..fe364a9 100644
--- a/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchOps.cpp
+++ b/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchOps.cpp
@@ -636,7 +636,7 @@ StartTensorCopyOp::bufferize(RewriterBase &rewriter,
   // Zero out the entire buffer prior to overwriting it with the copied values.
   // TODO: This could be optimized to only zero regions that won't be filled
   // with the copied values at the cost of 2^rank transfers instead of two.
-  if (hasPadding())
+  if (hasPadding() && !getUndefPadding())
     rewriter.create<StartZeroMemTransferOp>(getLoc(), *alloc);
 
   // Subview into the original memory without any padding.
diff --git a/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchOps.td b/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchOps.td
index e634b45..6c1b9a7 100644
--- a/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchOps.td
+++ b/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchOps.td
@@ -199,7 +199,8 @@ def QuidditchSnitch_StartTensorCopyOp : QuidditchSnitch_Op<"start_tensor_copy",
   let arguments = (ins
     AnyRankedTensor:$copy,
     Variadic<Index>:$high_pad,
-    OptionalAttr<DenseI64ArrayAttr>:$static_high_pad
+    OptionalAttr<DenseI64ArrayAttr>:$static_high_pad,
+    UnitAttr:$undef_padding
   );
 
   let results = (outs
@@ -209,7 +210,7 @@
   let assemblyFormat = [{
     $copy `to` `L1`
-    ( `pad` `with` `zero` `to`
+    ( `pad` `with` (`undef` $undef_padding^) : (`zero`)? `to`
     custom<DynamicIndexList>($high_pad, $static_high_pad)^)?
     `:` type($copy) `->` type($result) attr-dict
   }];
diff --git a/codegen/compiler/src/Quidditch/Dialect/Snitch/Transforms/PromoteToL1.cpp b/codegen/compiler/src/Quidditch/Dialect/Snitch/Transforms/PromoteToL1.cpp
index 2a5129b..a99a932 100644
--- a/codegen/compiler/src/Quidditch/Dialect/Snitch/Transforms/PromoteToL1.cpp
+++ b/codegen/compiler/src/Quidditch/Dialect/Snitch/Transforms/PromoteToL1.cpp
@@ -101,17 +101,20 @@ void PromotePadsToL1::runOnOperation() {
     if (!constant)
       return;
 
-    // 'start_tensor_copy' only supports zero-padding right now.
+    // 'start_tensor_copy' supports zero-padding and undef-padding right now.
     // Poison (undef) can also be lowered to perform zero-padding.
    if (!matchPattern(constant, m_NonZero()) &&
        !matchPattern(constant, m_PosZeroFloat()) &&
        !matchPattern(constant, m_Constant<ub::PoisonAttr>(nullptr)))
      return;
 
+    bool undefPadding =
+        matchPattern(constant, m_Constant<ub::PoisonAttr>(nullptr));
     OpBuilder builder(padOp);
     auto copyOp = builder.create<StartTensorCopyOp>(
         padOp.getLoc(), padOp.getType(), builder.getType<DMATokenType>(),
-        padOp.getSource(), padOp.getHigh(), padOp.getStaticHighAttr());
+        padOp.getSource(), padOp.getHigh(), padOp.getStaticHighAttr(),
+        undefPadding);
     auto waitOp = builder.create<WaitForTensorCopyOp>(
         padOp.getLoc(), copyOp.getResult(), copyOp.getToken(),
         /*copy=*/padOp.getSource());
diff --git a/codegen/tests/Dialect/Snitch/IR/bufferization.mlir b/codegen/tests/Dialect/Snitch/IR/bufferization.mlir
index 7aa9088..97d07c3 100644
--- a/codegen/tests/Dialect/Snitch/IR/bufferization.mlir
+++ b/codegen/tests/Dialect/Snitch/IR/bufferization.mlir
@@ -165,3 +165,26 @@ func.func @tensor_copy_pad(%arg0 : tensor<?x?xf64>, %pad0 : index, %pad1 : index
 // CHECK: return %[[TENSOR]], %[[TOKEN]]
   return %r, %t : tensor<?x?xf64>, !quidditch_snitch.dma_token
 }
+
+// CHECK-LABEL: @tensor_copy_pad_undef
+// CHECK-SAME: %[[ARG0:[[:alnum:]]+]]
+// CHECK-SAME: %[[PAD0:[[:alnum:]]+]]
+// CHECK-SAME: %[[PAD1:[[:alnum:]]+]]
+func.func @tensor_copy_pad_undef(%arg0 : tensor<?x?xf64>, %pad0 : index, %pad1 : index) -> (tensor<?x?xf64>, !quidditch_snitch.dma_token) {
+  // CHECK: %[[COPY:.*]] = bufferization.to_memref %[[ARG0]]
+  // CHECK: %[[ZERO:.*]] = arith.constant 0
+  // CHECK: %[[DIM0:.*]] = memref.dim %[[COPY]], %[[ZERO]]
+  // CHECK: %[[ONE:.*]] = arith.constant 1
+  // CHECK: %[[DIM1:.*]] = memref.dim %[[COPY]], %[[ONE]]
+  // CHECK: %[[NEW_DIM0:.*]] = affine.apply #[[$MAP2]]()[%[[DIM0]], %[[PAD0]]]
+  // CHECK: %[[NEW_DIM1:.*]] = affine.apply #[[$MAP2]]()[%[[DIM1]], %[[PAD1]]]
+  // CHECK: %[[ALLOC:.*]] = memref.alloc(%[[NEW_DIM0]], %[[NEW_DIM1]])
+  // CHECK-NOT: start_zero_mem_transfer
+  // CHECK: %[[UNPADDED:.*]] = memref.subview %[[ALLOC]][0, 0] [%[[DIM0]], %[[DIM1]]] [1, 1]
+  // CHECK-NEXT: %[[TOKEN:.*]] = quidditch_snitch.start_dma_transfer from %[[COPY]]
+  // CHECK-SAME: to %[[UNPADDED]]
+  %r, %t = quidditch_snitch.start_tensor_copy %arg0 to L1 pad with undef to [%pad0, %pad1] : tensor<?x?xf64> -> tensor<?x?xf64>
+  // CHECK: %[[TENSOR:.*]] = bufferization.to_tensor %[[ALLOC]]
+  // CHECK: return %[[TENSOR]], %[[TOKEN]]
+  return %r, %t : tensor<?x?xf64>, !quidditch_snitch.dma_token
+}
diff --git a/codegen/tests/Dialect/Snitch/Transforms/promote-pads-to-l1.mlir b/codegen/tests/Dialect/Snitch/Transforms/promote-pads-to-l1.mlir
index de95548..4359b9e 100644
--- a/codegen/tests/Dialect/Snitch/Transforms/promote-pads-to-l1.mlir
+++ b/codegen/tests/Dialect/Snitch/Transforms/promote-pads-to-l1.mlir
@@ -22,7 +22,7 @@ func.func @test_zero_f32(%a : tensor<32x32xf32>) -> tensor<33x33xf32> {
 func.func @test_poison(%a : tensor<32x32xf32>) -> tensor<33x33xf32> {
   %c = ub.poison : f32
   // CHECK: %[[R:.*]], %[[T:.*]] = quidditch_snitch.start_tensor_copy %[[A]]
-  // CHECK-SAME: pad with zero to [1, 1]
+  // CHECK-SAME: pad with undef to [1, 1]
   // CHECK: %[[R2:.*]] = quidditch_snitch.wait_for_tensor_copy of %[[A]]
   // CHECK-SAME: to %[[R]]
   // CHECK-SAME: using %[[T]]