diff --git a/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchOps.cpp b/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchOps.cpp
index a14b822..fe364a9 100644
--- a/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchOps.cpp
+++ b/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchOps.cpp
@@ -636,7 +636,7 @@ StartTensorCopyOp::bufferize(RewriterBase &rewriter,
   // Zero out the entire buffer prior to overwriting it with the copied values.
   // TODO: This could be optimized to only zero regions that won't be filled
   // with the copied values at the cost of 2^rank transfers instead of two.
-  if (hasPadding())
+  if (hasPadding() && !getUndefPadding())
     rewriter.create<StartZeroMemTransferOp>(getLoc(), *alloc);
 
   // Subview into the original memory without any padding.
diff --git a/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchOps.td b/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchOps.td
index e634b45..6c1b9a7 100644
--- a/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchOps.td
+++ b/codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchOps.td
@@ -199,7 +199,8 @@ def QuidditchSnitch_StartTensorCopyOp : QuidditchSnitch_Op<"start_tensor_copy",
   let arguments = (ins
     AnyRankedTensor:$copy,
     Variadic<Index>:$high_pad,
-    OptionalAttr<DenseI64ArrayAttr>:$static_high_pad
+    OptionalAttr<DenseI64ArrayAttr>:$static_high_pad,
+    UnitAttr:$undef_padding
   );
 
   let results = (outs
@@ -209,7 +210,7 @@
   let assemblyFormat = [{
     $copy `to` `L1`
-    ( `pad` `with` `zero` `to`
+    ( `pad` `with` (`undef` $undef_padding^) : (`zero`)? `to`
     custom<DynamicIndexList>($high_pad, $static_high_pad)^)?
     `:` type($copy) `->` type($result) attr-dict
   }];
diff --git a/codegen/compiler/src/Quidditch/Dialect/Snitch/Transforms/PromoteToL1.cpp b/codegen/compiler/src/Quidditch/Dialect/Snitch/Transforms/PromoteToL1.cpp
index 2a5129b..a99a932 100644
--- a/codegen/compiler/src/Quidditch/Dialect/Snitch/Transforms/PromoteToL1.cpp
+++ b/codegen/compiler/src/Quidditch/Dialect/Snitch/Transforms/PromoteToL1.cpp
@@ -101,17 +101,20 @@ void PromotePadsToL1::runOnOperation() {
     if (!constant)
       return;
 
-    // 'start_tensor_copy' only supports zero-padding right now.
+    // 'start_tensor_copy' supports zero-padding and undef-padding right now.
     // Poison (undef) can also be lowered to perform zero-padding.
    if (!matchPattern(constant, m_NonZero()) &&
        !matchPattern(constant, m_PosZeroFloat()) &&
        !matchPattern(constant, m_Constant<ub::PoisonAttr>(nullptr)))
      return;
 
+    bool undefPadding =
+        matchPattern(constant, m_Constant<ub::PoisonAttr>(nullptr));
     OpBuilder builder(padOp);
     auto copyOp = builder.create<StartTensorCopyOp>(
         padOp.getLoc(), padOp.getType(), builder.getType<DMATokenType>(),
-        padOp.getSource(), padOp.getHigh(), padOp.getStaticHighAttr());
+        padOp.getSource(), padOp.getHigh(), padOp.getStaticHighAttr(),
+        undefPadding);
     auto waitOp = builder.create<WaitForTensorCopyOp>(
         padOp.getLoc(), copyOp.getResult(), copyOp.getToken(),
         /*copy=*/padOp.getSource());
diff --git a/codegen/tests/Dialect/Snitch/IR/bufferization.mlir b/codegen/tests/Dialect/Snitch/IR/bufferization.mlir
index 7aa9088..97d07c3 100644
--- a/codegen/tests/Dialect/Snitch/IR/bufferization.mlir
+++ b/codegen/tests/Dialect/Snitch/IR/bufferization.mlir
@@ -165,3 +165,26 @@ func.func @tensor_copy_pad(%arg0 : tensor<?x?xf64>, %pad0 : index, %pad1 : index
 // CHECK: return %[[TENSOR]], %[[TOKEN]]
   return %r, %t : tensor<?x?xf64>, !quidditch_snitch.dma_token
 }
+
+// CHECK-LABEL: @tensor_copy_pad_undef
+// CHECK-SAME: %[[ARG0:[[:alnum:]]+]]
+// CHECK-SAME: %[[PAD0:[[:alnum:]]+]]
+// CHECK-SAME: %[[PAD1:[[:alnum:]]+]]
+func.func @tensor_copy_pad_undef(%arg0 : tensor<?x?xf64>, %pad0 : index, %pad1 : index) -> (tensor<?x?xf64>, !quidditch_snitch.dma_token) {
+  // CHECK: %[[COPY:.*]] = bufferization.to_memref %[[ARG0]]
+  // CHECK: %[[ZERO:.*]] = arith.constant 0
+  // CHECK: %[[DIM0:.*]] = memref.dim %[[COPY]], %[[ZERO]]
+  // CHECK: %[[ONE:.*]] = arith.constant 1
+  // CHECK: %[[DIM1:.*]] = memref.dim %[[COPY]], %[[ONE]]
+  // CHECK: %[[NEW_DIM0:.*]] = affine.apply #[[$MAP2]]()[%[[DIM0]], %[[PAD0]]]
+  // CHECK: %[[NEW_DIM1:.*]] = affine.apply #[[$MAP2]]()[%[[DIM1]], %[[PAD1]]]
+  // CHECK: %[[ALLOC:.*]] = memref.alloc(%[[NEW_DIM0]], %[[NEW_DIM1]])
+  // CHECK-NOT: start_zero_mem_transfer
+  // CHECK: %[[UNPADDED:.*]] = memref.subview %[[ALLOC]][0, 0] [%[[DIM0]], %[[DIM1]]] [1, 1]
+  // CHECK-NEXT: %[[TOKEN:.*]] = quidditch_snitch.start_dma_transfer from %[[COPY]]
+  // CHECK-SAME: to %[[UNPADDED]]
+  %r, %t = quidditch_snitch.start_tensor_copy %arg0 to L1 pad with undef to [%pad0, %pad1] : tensor<?x?xf64> -> tensor<?x?xf64>
+  // CHECK: %[[TENSOR:.*]] = bufferization.to_tensor %[[ALLOC]]
+  // CHECK: return %[[TENSOR]], %[[TOKEN]]
+  return %r, %t : tensor<?x?xf64>, !quidditch_snitch.dma_token
+}
diff --git a/codegen/tests/Dialect/Snitch/Transforms/promote-pads-to-l1.mlir b/codegen/tests/Dialect/Snitch/Transforms/promote-pads-to-l1.mlir
index de95548..4359b9e 100644
--- a/codegen/tests/Dialect/Snitch/Transforms/promote-pads-to-l1.mlir
+++ b/codegen/tests/Dialect/Snitch/Transforms/promote-pads-to-l1.mlir
@@ -22,7 +22,7 @@ func.func @test_zero_f32(%a : tensor<32x32xf32>) -> tensor<33x33xf32> {
 func.func @test_poison(%a : tensor<32x32xf32>) -> tensor<33x33xf32> {
   %c = ub.poison : f32
   // CHECK: %[[R:.*]], %[[T:.*]] = quidditch_snitch.start_tensor_copy %[[A]]
-  // CHECK-SAME: pad with zero to [1, 1]
+  // CHECK-SAME: pad with undef to [1, 1]
   // CHECK: %[[R2:.*]] = quidditch_snitch.wait_for_tensor_copy of %[[A]]
   // CHECK-SAME: to %[[R]]
   // CHECK-SAME: using %[[T]]