Skip to content

Commit

Permalink
[quidditch_snitch] Implement undef padding for start_tensor_copy (#120)
Browse files Browse the repository at this point in the history

Undef padding was previously lowered to zero padding, which involves
additional DMA transfers to zero out the allocation. The majority of
operations do not require padding with a specific value, making the
zero padding — and therefore those DMA transfers — redundant.
  • Loading branch information
zero9178 authored Aug 22, 2024
1 parent 6db2996 commit 49218b5
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -636,7 +636,7 @@ StartTensorCopyOp::bufferize(RewriterBase &rewriter,
// Zero out the entire buffer prior to overwriting it with the copied values.
// TODO: This could be optimized to only zero regions that won't be filled
// with the copied values at the cost of 2^rank transfers instead of two.
if (hasPadding())
if (hasPadding() && !getUndefPadding())
rewriter.create<StartZeroMemTransferOp>(getLoc(), *alloc);

// Subview into the original memory without any padding.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,8 @@ def QuidditchSnitch_StartTensorCopyOp : QuidditchSnitch_Op<"start_tensor_copy",

let arguments = (ins AnyRankedTensor:$copy,
Variadic<Index>:$high_pad,
OptionalAttr<DenseI64ArrayAttr>:$static_high_pad
OptionalAttr<DenseI64ArrayAttr>:$static_high_pad,
UnitAttr:$undef_padding
);

let results = (outs
Expand All @@ -209,7 +210,7 @@ def QuidditchSnitch_StartTensorCopyOp : QuidditchSnitch_Op<"start_tensor_copy",

let assemblyFormat = [{
$copy `to` `L1`
( `pad` `with` `zero` `to`
( `pad` `with` (`undef` $undef_padding^) : (`zero`)? `to`
custom<DynamicIndexList>($high_pad, $static_high_pad)^)?
`:` type($copy) `->` type($result) attr-dict
}];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,17 +101,20 @@ void PromotePadsToL1::runOnOperation() {
if (!constant)
return;

// 'start_tensor_copy' only supports zero-padding right now.
// 'start_tensor_copy' supports zero-padding and undef-padding right now.
// Poison (undef) can also be lowered to perform zero-padding.
if (!matchPattern(constant, m_NonZero()) &&
!matchPattern(constant, m_PosZeroFloat()) &&
!matchPattern(constant, m_Constant<ub::PoisonAttr>(nullptr)))
return;
bool undefPadding =
matchPattern(constant, m_Constant<ub::PoisonAttr>(nullptr));

OpBuilder builder(padOp);
auto copyOp = builder.create<StartTensorCopyOp>(
padOp.getLoc(), padOp.getType(), builder.getType<DMATokenType>(),
padOp.getSource(), padOp.getHigh(), padOp.getStaticHighAttr());
padOp.getSource(), padOp.getHigh(), padOp.getStaticHighAttr(),
undefPadding);
auto waitOp = builder.create<WaitForTensorCopyOp>(
padOp.getLoc(), copyOp.getResult(), copyOp.getToken(),
/*copy=*/padOp.getSource());
Expand Down
23 changes: 23 additions & 0 deletions codegen/tests/Dialect/Snitch/IR/bufferization.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -165,3 +165,26 @@ func.func @tensor_copy_pad(%arg0 : tensor<?x?xf32>, %pad0 : index, %pad1 : index
// CHECK: return %[[TENSOR]], %[[TOKEN]]
return %r, %t : tensor<?x?xf32>, !quidditch_snitch.dma_token
}

// Bufferization test for `start_tensor_copy ... pad with undef`.
// Unlike zero padding, undef padding must NOT zero-initialize the padded
// allocation: the negative match below asserts that no zero-mem transfer
// is emitted between the alloc and the subview, while the rest of the
// lowering (alloc of the enlarged buffer, subview into the unpadded
// region, DMA transfer of the source) stays identical to the zero case.
// CHECK-LABEL: @tensor_copy_pad_undef
// CHECK-SAME: %[[ARG0:[[:alnum:]]+]]
// CHECK-SAME: %[[PAD0:[[:alnum:]]+]]
// CHECK-SAME: %[[PAD1:[[:alnum:]]+]]
func.func @tensor_copy_pad_undef(%arg0 : tensor<?x?xf32>, %pad0 : index, %pad1 : index) -> (tensor<?x?xf32>, !quidditch_snitch.dma_token) {
// CHECK: %[[COPY:.*]] = bufferization.to_memref %[[ARG0]]
// CHECK: %[[ZERO:.*]] = arith.constant 0
// CHECK: %[[DIM0:.*]] = memref.dim %[[COPY]], %[[ZERO]]
// CHECK: %[[ONE:.*]] = arith.constant 1
// CHECK: %[[DIM1:.*]] = memref.dim %[[COPY]], %[[ONE]]
// Padded sizes are computed as dim + pad via affine maps defined earlier in
// this test file (presumably #map2 = dim + pad — confirm against the file's
// map definitions, which are outside this excerpt).
// CHECK: %[[NEW_DIM0:.*]] = affine.apply #[[$MAP2]]()[%[[DIM0]], %[[PAD0]]]
// CHECK: %[[NEW_DIM1:.*]] = affine.apply #[[$MAP2]]()[%[[DIM1]], %[[PAD1]]]
// CHECK: %[[ALLOC:.*]] = memref.alloc(%[[NEW_DIM0]], %[[NEW_DIM1]])
// The key assertion: no zeroing of the allocation for undef padding.
// CHECK-NOT: start_zero_mem_transfer
// CHECK: %[[UNPADDED:.*]] = memref.subview %[[ALLOC]][0, 0] [%[[DIM0]], %[[DIM1]]] [1, 1]
// CHECK-NEXT: %[[TOKEN:.*]] = quidditch_snitch.start_dma_transfer from %[[COPY]]
// CHECK-SAME: to %[[UNPADDED]]
%r, %t = quidditch_snitch.start_tensor_copy %arg0 to L1 pad with undef to [%pad0, %pad1] : tensor<?x?xf32> -> tensor<?x?xf32>
// CHECK: %[[TENSOR:.*]] = bufferization.to_tensor %[[ALLOC]]
// CHECK: return %[[TENSOR]], %[[TOKEN]]
return %r, %t : tensor<?x?xf32>, !quidditch_snitch.dma_token
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ func.func @test_zero_f32(%a : tensor<32x32xf32>) -> tensor<33x33xf32> {
func.func @test_poison(%a : tensor<32x32xf32>) -> tensor<33x33xf32> {
%c = ub.poison : f32
// CHECK: %[[R:.*]], %[[T:.*]] = quidditch_snitch.start_tensor_copy %[[A]]
// CHECK-SAME: pad with zero to [1, 1]
// CHECK-SAME: pad with undef to [1, 1]
// CHECK: %[[R2:.*]] = quidditch_snitch.wait_for_tensor_copy of %[[A]]
// CHECK-SAME: to %[[R]]
// CHECK-SAME: using %[[T]]
Expand Down

0 comments on commit 49218b5

Please sign in to comment.