[quidditch_snitch] Add padding capabilities to start_tensor_copy (#116)

We occasionally encounter shapes that are challenging to tile because of the
prime factors involved. Distributing such a dimension (e.g. across compute
cores or vector lanes) when the required number of tiles does not evenly
divide it generates dynamic dimensions that the microkernel compilation cannot
handle. Similarly, once we are on `f32`, we are required to vectorize the
kernel, which restricts the tile size of e.g. a matvec to a multiple of 4, 8,
etc.

This PR therefore introduces optional padding on the `start_tensor_copy` op
that can be added at the end of each tensor dimension. When tiling, the
padding can be chosen such that a tensor is always of a given static shape,
solving the issue noted above. For now, the padding value is always zero,
which is correct for any matmul, elementwise operation, or convolution.

Note that the padding option is not yet used in the pipeline; a future PR will
add a lowering from `tensor.pad` operations to it.
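
For illustration, here is a minimal sketch of the extended textual form this
change enables. The `pad with zero to` syntax follows the assemblyFormat in the
diff below; the `quidditch_snitch` prefix, SSA names, and shapes are assumptions:

```mlir
// Copy a 32x60 tensor into L1, padding the trailing dimension by 4 with zeros
// so the result has a static, vectorization-friendly shape.
%copied, %token = quidditch_snitch.start_tensor_copy %input to L1
    pad with zero to [0, 4] : tensor<32x60xf32> -> tensor<32x64xf32>
```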
zero9178 authored Aug 19, 2024
1 parent e3a9101 commit 812da1a
Showing 7 changed files with 242 additions and 56 deletions.
124 changes: 107 additions & 17 deletions codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchOps.cpp
@@ -1,6 +1,7 @@
#include "QuidditchSnitchOps.h"

#include "llvm/ADT/ScopeExit.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
@@ -399,34 +400,98 @@ void MicrokernelFenceOp::replaceWithNoop(RewriterBase &rewriter) {
// StartTensorCopyOp
//===----------------------------------------------------------------------===//

LogicalResult StartTensorCopyOp::verify() {
if (getStaticHighPadAttr())
if (getStaticHighPadAttr().size() != getCopy().getType().getRank())
return emitOpError("expected padding number for every dimension");

unsigned numDynamicPads = llvm::count(
getStaticHighPad().value_or(std::nullopt), ShapedType::kDynamic);
if (numDynamicPads != getHighPad().size())
return emitOpError("expected ")
<< numDynamicPads << " dynamic padding values";

return success();
}
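
// Illustration (not part of the commit): a form this verifier accepts. The
// `quidditch_snitch` prefix, SSA names, and shapes are assumptions; the rules
// shown are those enforced above.
//
// ```mlir
// // The static list must have one entry per dimension of the copied tensor;
// // the dynamic amount %pad is carried both as an SSA operand and as a
// // kDynamic sentinel in static_high_pad.
// %b, %tb = quidditch_snitch.start_tensor_copy %arg0 to L1
//     pad with zero to [%pad, 4] : tensor<10x60xf32> -> tensor<?x64xf32>
// ```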

LogicalResult StartTensorCopyOp::fold(FoldAdaptor adaptor,
SmallVectorImpl<OpFoldResult> &results) {
if (hasPadding()) {
// Remove noop padding.
if (llvm::all_of(getStaticHighPadAttr().asArrayRef(),
[](int64_t value) { return value == 0; })) {
removeStaticHighPadAttr();
return success();
}

// Fold dynamic indices with constant values into the static list.
{
bool changed = false;
SmallVector<int64_t> padding =
llvm::to_vector(getStaticHighPadAttr().asArrayRef());
unsigned dynamicIndex = 0;
for (int64_t &value : padding) {
if (!ShapedType::isDynamic(value))
continue;

if (auto integer = dyn_cast_or_null<IntegerAttr>(
adaptor.getHighPad()[dynamicIndex])) {
value = integer.getValue().getZExtValue();
getHighPadMutable().erase(dynamicIndex);
changed = true;
} else {
dynamicIndex++;
}
}
if (changed) {
setStaticHighPad(padding);
return success();
}
}
}

auto waitOp = getCopy().getDefiningOp<WaitForTensorCopyOp>();
if (!waitOp)
return failure();
auto copyOp = waitOp.getTransferTensor().getDefiningOp<StartTensorCopyOp>();
if (!copyOp)
return failure();
if (copyOp.getStaticHighPadAttr() != getStaticHighPadAttr() ||
copyOp.getHighPad() != getHighPad())
return failure();

results.emplace_back(waitOp);
results.emplace_back(CompletedTokenAttr::get(getContext()));
return success();
}
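
// Sketch (not part of the commit) of the two padding-related folds above,
// with assumed names and shapes:
//
// ```mlir
// // All-zero padding is a no-op and is dropped:
// %r, %t = quidditch_snitch.start_tensor_copy %arg0 to L1
//     pad with zero to [0, 0] : tensor<4x8xf32> -> tensor<4x8xf32>
// // folds to:
// %r, %t = quidditch_snitch.start_tensor_copy %arg0 to L1
//     : tensor<4x8xf32> -> tensor<4x8xf32>
//
// // A dynamic pad fed by a constant is folded into the static list:
// %c4 = arith.constant 4 : index
// %s, %u = quidditch_snitch.start_tensor_copy %arg0 to L1
//     pad with zero to [0, %c4] : tensor<4x8xf32> -> tensor<4x12xf32>
// // folds to `pad with zero to [0, 4]` with no index operands.
// ```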

SmallVector<OpFoldResult> StartTensorCopyOp::getMixedHighPad() {
Builder builder(getContext());
if (!hasPadding())
return SmallVector<OpFoldResult>(getResult().getType().getRank(),
builder.getIndexAttr(0));

return getMixedValues(getStaticHighPadAttr().asArrayRef(), getHighPad(),
builder);
}

//===----------------------------------------------------------------------===//
// StartTensorCopyOp::BufferizableOpInterface
//===----------------------------------------------------------------------===//

/// Returns whether 'copy' is already in L1 memory.
/// Returns whether the allocation can be elided entirely.
/// Returns an empty optional if it was not possible to determine.
static std::optional<bool>
isInL1Memory(Value copy,
const bufferization::BufferizationOptions &options = {},
SmallVector<Value> *invocationStack = nullptr) {
std::optional<bool> StartTensorCopyOp::elidesAllocation(
const bufferization::BufferizationOptions &options,
SmallVector<Value> *invocationStack) {
// Padding cannot be elided in general, even if the copied buffer is in L1.
if (hasPadding())
return false;

FailureOr<BaseMemRefType> copyType =
invocationStack
? bufferization::getBufferType(copy, options, *invocationStack)
: bufferization::getBufferType(copy, options);
? bufferization::getBufferType(getCopy(), options, *invocationStack)
: bufferization::getBufferType(getCopy(), options);
if (failed(copyType))
return std::nullopt;

@@ -437,7 +502,7 @@ bool StartTensorCopyOp::resultBufferizesToMemoryWrite(
OpResult opResult, const bufferization::AnalysisState &state) {
assert(opResult == getResult() && "no other result");

std::optional<bool> matches = isInL1Memory(getCopy(), state.getOptions());
std::optional<bool> matches = elidesAllocation(state.getOptions());
// Conservative answer.
if (!matches)
return true;
@@ -451,7 +516,7 @@ bool StartTensorCopyOp::bufferizesToMemoryRead(
OpOperand &opOperand, const bufferization::AnalysisState &state) {
assert(opOperand == getCopyMutable() && "have only one operand");

std::optional<bool> result = isInL1Memory(getCopy(), state.getOptions());
std::optional<bool> result = elidesAllocation(state.getOptions());
// Conservative answer.
if (!result)
return true;
@@ -472,7 +537,7 @@ AliasingValueList StartTensorCopyOp::getAliasingValues(
OpOperand &opOperand, const bufferization::AnalysisState &state) {
assert(opOperand == getCopyMutable() && "have only one operand");

std::optional<bool> result = isInL1Memory(getCopy(), state.getOptions());
std::optional<bool> result = elidesAllocation(state.getOptions());
if (!result)
// Assume the worst case.
return {{getResult(), BufferRelation::Equivalent, /*isDefinite=*/false}};
@@ -488,7 +553,7 @@ AliasingValueList StartTensorCopyOp::getAliasingValues(
bool StartTensorCopyOp::bufferizesToAllocation(Value value) {
assert(value == getResult() && "have only one result");

if (isInL1Memory(getCopy()) == true)
if (elidesAllocation() == true)
return false;

// True is the conservative reply, according to the docs.
@@ -503,7 +568,7 @@ StartTensorCopyOp::getBufferType(Value value,

bool contained = llvm::is_contained(invocationStack, value);
if (!contained)
if (isInL1Memory(getCopy(), options, &invocationStack) == true)
if (elidesAllocation(options, &invocationStack) == true)
return bufferization::getBufferType(getCopy(), options, invocationStack);

// Unless contained in the invocation stack (where we are free to impose the
@@ -530,7 +595,7 @@ StartTensorCopyOp::bufferize(RewriterBase &rewriter,
if (failed(copyBuffer))
return failure();

std::optional<bool> result = isInL1Memory(getCopy(), options);
std::optional<bool> result = elidesAllocation(options);
if (!result)
return failure();

@@ -546,12 +611,20 @@ StartTensorCopyOp::bufferize(RewriterBase &rewriter,
if (failed(allocType))
return failure();

SmallVector<OpFoldResult> copyBufferSizes =
memref::getMixedSizes(rewriter, getLoc(), *copyBuffer);

// Compute the dynamic dimensions for the allocation.
SmallVector<Value> dynamicDims;
for (auto [index, shape] : llvm::enumerate(allocType->getShape())) {
for (auto [index, shape, pad] :
llvm::enumerate(allocType->getShape(), getMixedHighPad())) {
if (!ShapedType::isDynamic(shape))
continue;
dynamicDims.push_back(
rewriter.create<memref::DimOp>(getLoc(), *copyBuffer, index));

dynamicDims.push_back(affine::makeComposedAffineApply(
rewriter, getLoc(),
rewriter.getAffineDimExpr(0) + rewriter.getAffineDimExpr(1),
ArrayRef<OpFoldResult>{copyBufferSizes[index], pad}));
}

FailureOr<Value> alloc = options.createAlloc(
@@ -560,8 +633,25 @@ StartTensorCopyOp::bufferize(RewriterBase &rewriter,
if (failed(alloc))
return failure();

// Zero out the entire buffer prior to overwriting it with the copied values.
// TODO: This could be optimized to only zero regions that won't be filled
// with the copied values at the cost of 2^rank transfers instead of two.
if (hasPadding())
rewriter.create<StartZeroMemTransferOp>(getLoc(), *alloc);

// Subview into the original memory without any padding.
// As we only add padding at the end of the dimensions, the offsets are always
// zero.
Value destination = rewriter.create<memref::SubViewOp>(
getLoc(), *alloc,
/*offsets=*/
SmallVector<OpFoldResult>(allocType->getRank(), rewriter.getIndexAttr(0)),
copyBufferSizes,
/*strides=*/
SmallVector<OpFoldResult>(allocType->getRank(),
rewriter.getIndexAttr(1)));
Value token =
rewriter.create<StartDMATransferOp>(getLoc(), *copyBuffer, *alloc);
rewriter.create<StartDMATransferOp>(getLoc(), *copyBuffer, destination);

// Replace op.
replaceOpWithBufferizedValues(rewriter, getOperation(), {*alloc, token});
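
Roughly, the bufferization above produces the following buffer-level sequence for
a padded copy. This is only a sketch: the quidditch_snitch op spellings and the
L1 memory-space annotation are approximated in comments, and the shapes are made up.

```mlir
// Source: memref<30x7xf32>, padded by [2, 1] to a static 32x8 L1 buffer.
%alloc = memref.alloc() : memref<32x8xf32>            // padded L1 allocation
// 1. start_zero_mem_transfer zeroes all of %alloc (the padding value).
%dest = memref.subview %alloc[0, 0] [30, 7] [1, 1]
    : memref<32x8xf32> to memref<30x7xf32, strided<[8, 1]>>
// 2. start_dma_transfer copies the source into %dest, so only the trailing
//    padded elements of %alloc stay zero.
```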
@@ -175,39 +175,74 @@ def QuidditchSnitch_MicrokernelFenceOp : QuidditchSnitch_Op<"microkernel_fence",
}

def QuidditchSnitch_StartTensorCopyOp : QuidditchSnitch_Op<"start_tensor_copy",
[AllTypesMatch<["copy", "result"]>, Pure,
[Pure, AllRanksMatch<["copy", "result"]>,
DeclareOpInterfaceMethods<BufferizableOpInterface,
["resultBufferizesToMemoryWrite", "bufferizesToMemoryRead",
"bufferizesToMemoryWrite", "getAliasingValues", "getBufferType",
"bufferize", "bufferizesToAllocation"]>]> {

let description = [{
Operation starting a copy of a tensor to L1 memory space returning it as
a new tensor.
The contained values of the tensor in an unspecified state.
Operation starting a copy of a tensor to L1 memory space, optionally adding
padding and returning it as a new tensor.
The contained values of the resulting tensor are in an unspecified state.
See `wait_for_tensor_copy` to transform the tensor value into a state
equal to `$copy`.

This operation is a noop if `$copy` and `$result` are already in L1 and
bufferization can elide the copy.
The operation may optionally add padding at the end of each dimension of
the tensor. Zero is used as the padding value.
The dimensions of the result tensor are computed using
`dims(copy)[i] + high_pad[i]`.

This operation is a noop if `$copy` is already in L1, no padding is added,
and bufferization can elide the copy.
}];

let arguments = (ins AnyRankedTensor:$copy);
let arguments = (ins AnyRankedTensor:$copy,
Variadic<Index>:$high_pad,
OptionalAttr<DenseI64ArrayAttr>:$static_high_pad
);

let results = (outs
AnyRankedTensor:$result,
QuidditchSnitch_DMATokenType:$token
);

let assemblyFormat = [{
$copy `to` `L1` `:` type($copy) attr-dict
$copy `to` `L1`
( `pad` `with` `zero` `to`
custom<DynamicIndexList>($high_pad, $static_high_pad)^)?
`:` type($copy) `->` type($result) attr-dict
}];

let builders = [
OpBuilder<(ins "mlir::Value":$copy), [{
build($_builder, $_state, copy.getType(),
$_builder.getType<DMATokenType>(), copy,
/*high_pad=*/mlir::ValueRange(), /*static_high_pad=*/nullptr);
}]>
];

let hasVerifier = 1;

let extraClassDeclaration = [{
private:
std::optional<bool>
elidesAllocation(const mlir::bufferization::BufferizationOptions &options = {},
llvm::SmallVector<mlir::Value> *invocationStack = nullptr);
public:

bool hasPadding() {
return static_cast<bool>(getStaticHighPadAttr());
}

llvm::SmallVector<mlir::OpFoldResult> getMixedHighPad();
}];

let hasFolder = 1;
}

def QuidditchSnitch_WaitForTensorCopyOp : QuidditchSnitch_Op<"wait_for_tensor_copy",
[AllTypesMatch<["transfer_tensor", "result", "copy"]>, Pure,
[AllTypesMatch<["transfer_tensor", "result"]>, Pure,
DeclareOpInterfaceMethods<BufferizableOpInterface,
["bufferizesToMemoryRead", "bufferizesToMemoryWrite", "getAliasingValues",
"bufferize", "mustBufferizeInPlace", "isNotConflicting"]>]> {
@@ -240,7 +275,7 @@ def QuidditchSnitch_WaitForTensorCopyOp : QuidditchSnitch_Op<"wait_for_tensor_co
);

let assemblyFormat = [{
`of` $copy `to` $transfer_tensor `using` $token `:` type($transfer_tensor) attr-dict
`of` $copy `:` type($copy) `to` $transfer_tensor `using` $token `->` type($transfer_tensor) attr-dict
}];

let hasFolder = 1;