Skip to content

Commit

Permalink
[quidditch_snitch] Implement undef padding for start_tensor_copy
Browse files Browse the repository at this point in the history
Undef padding was previously lowered to zero padding, which involves an additional DMA transfer required to zero out the allocation.
The majority of operations do not require padding with a specific value, making the zero padding — and therefore the extra DMA transfer — redundant.
  • Loading branch information
zero9178 committed Aug 21, 2024
1 parent 6db2996 commit 6d90b07
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -636,7 +636,7 @@ StartTensorCopyOp::bufferize(RewriterBase &rewriter,
// Zero out the entire buffer prior to overwriting it with the copied values.
// TODO: This could be optimized to only zero regions that won't be filled
// with the copied values at the cost of 2^rank transfers instead of two.
if (hasPadding())
if (hasPadding() && !getUndefPadding())
rewriter.create<StartZeroMemTransferOp>(getLoc(), *alloc);

// Subview into the original memory without any padding.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,8 @@ def QuidditchSnitch_StartTensorCopyOp : QuidditchSnitch_Op<"start_tensor_copy",

let arguments = (ins AnyRankedTensor:$copy,
Variadic<Index>:$high_pad,
OptionalAttr<DenseI64ArrayAttr>:$static_high_pad
OptionalAttr<DenseI64ArrayAttr>:$static_high_pad,
UnitAttr:$undef_padding
);

let results = (outs
Expand All @@ -209,7 +210,7 @@ def QuidditchSnitch_StartTensorCopyOp : QuidditchSnitch_Op<"start_tensor_copy",

let assemblyFormat = [{
$copy `to` `L1`
( `pad` `with` `zero` `to`
( `pad` `with` (`undef` $undef_padding^) : (`zero`)? `to`
custom<DynamicIndexList>($high_pad, $static_high_pad)^)?
`:` type($copy) `->` type($result) attr-dict
}];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,17 +101,20 @@ void PromotePadsToL1::runOnOperation() {
if (!constant)
return;

// 'start_tensor_copy' only supports zero-padding right now.
// 'start_tensor_copy' supports zero-padding and undef-padding right now.
// Poison (undef) can also be lowered to perform zero-padding.
if (!matchPattern(constant, m_NonZero()) &&
!matchPattern(constant, m_PosZeroFloat()) &&
!matchPattern(constant, m_Constant<ub::PoisonAttr>(nullptr)))
return;
bool undefPadding =
matchPattern(constant, m_Constant<ub::PoisonAttr>(nullptr));

OpBuilder builder(padOp);
auto copyOp = builder.create<StartTensorCopyOp>(
padOp.getLoc(), padOp.getType(), builder.getType<DMATokenType>(),
padOp.getSource(), padOp.getHigh(), padOp.getStaticHighAttr());
padOp.getSource(), padOp.getHigh(), padOp.getStaticHighAttr(),
undefPadding);
auto waitOp = builder.create<WaitForTensorCopyOp>(
padOp.getLoc(), copyOp.getResult(), copyOp.getToken(),
/*copy=*/padOp.getSource());
Expand Down
23 changes: 23 additions & 0 deletions codegen/tests/Dialect/Snitch/IR/bufferization.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -165,3 +165,26 @@ func.func @tensor_copy_pad(%arg0 : tensor<?x?xf32>, %pad0 : index, %pad1 : index
// CHECK: return %[[TENSOR]], %[[TOKEN]]
return %r, %t : tensor<?x?xf32>, !quidditch_snitch.dma_token
}

// CHECK-LABEL: @tensor_copy_pad_undef
// CHECK-SAME: %[[ARG0:[[:alnum:]]+]]
// CHECK-SAME: %[[PAD0:[[:alnum:]]+]]
// CHECK-SAME: %[[PAD1:[[:alnum:]]+]]
func.func @tensor_copy_pad_undef(%arg0 : tensor<?x?xf32>, %pad0 : index, %pad1 : index) -> (tensor<?x?xf32>, !quidditch_snitch.dma_token) {
// CHECK: %[[COPY:.*]] = bufferization.to_memref %[[ARG0]]
// CHECK: %[[ZERO:.*]] = arith.constant 0
// CHECK: %[[DIM0:.*]] = memref.dim %[[COPY]], %[[ZERO]]
// CHECK: %[[ONE:.*]] = arith.constant 1
// CHECK: %[[DIM1:.*]] = memref.dim %[[COPY]], %[[ONE]]
// CHECK: %[[NEW_DIM0:.*]] = affine.apply #[[$MAP2]]()[%[[DIM0]], %[[PAD0]]]
// CHECK: %[[NEW_DIM1:.*]] = affine.apply #[[$MAP2]]()[%[[DIM1]], %[[PAD1]]]
// CHECK: %[[ALLOC:.*]] = memref.alloc(%[[NEW_DIM0]], %[[NEW_DIM1]])
// CHECK-NOT: start_zero_mem_transfer
// CHECK: %[[UNPADDED:.*]] = memref.subview %[[ALLOC]][0, 0] [%[[DIM0]], %[[DIM1]]] [1, 1]
// CHECK-NEXT: %[[TOKEN:.*]] = quidditch_snitch.start_dma_transfer from %[[COPY]]
// CHECK-SAME: to %[[UNPADDED]]
%r, %t = quidditch_snitch.start_tensor_copy %arg0 to L1 pad with undef to [%pad0, %pad1] : tensor<?x?xf32> -> tensor<?x?xf32>
// CHECK: %[[TENSOR:.*]] = bufferization.to_tensor %[[ALLOC]]
// CHECK: return %[[TENSOR]], %[[TOKEN]]
return %r, %t : tensor<?x?xf32>, !quidditch_snitch.dma_token
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ func.func @test_zero_f32(%a : tensor<32x32xf32>) -> tensor<33x33xf32> {
func.func @test_poison(%a : tensor<32x32xf32>) -> tensor<33x33xf32> {
%c = ub.poison : f32
// CHECK: %[[R:.*]], %[[T:.*]] = quidditch_snitch.start_tensor_copy %[[A]]
// CHECK-SAME: pad with zero to [1, 1]
// CHECK-SAME: pad with undef to [1, 1]
// CHECK: %[[R2:.*]] = quidditch_snitch.wait_for_tensor_copy of %[[A]]
// CHECK-SAME: to %[[R]]
// CHECK-SAME: using %[[T]]
Expand Down

0 comments on commit 6d90b07

Please sign in to comment.