[quidditch_snitch] Add padding capabilities to start_tensor_copy (#116)

We occasionally encounter shapes that are challenging to tile because of the
prime factors involved. Distributing such a dimension (e.g. across compute
cores or vector lanes) when the required number of tiles does not evenly
divide it generates dynamic dimensions that the microkernel compilation cannot
handle. Similarly, once we are on `f32`, we are required to vectorize the
kernel, which restricts the tile size of e.g. a matvec to a multiple of 4, 8,
etc.

This PR therefore introduces optional padding on the `start_tensor_copy` op
that can be added at the end of each tensor dimension. When tiling, the
padding can be chosen such that a tensor is always of a given static shape,
solving the issue noted above. For now, the padding value is always zero,
which is correct for any matmul, elementwise operation, or convolution.

Note that the padding option is not yet used in the pipeline; a future PR will
add a lowering from `tensor.pad` operations to it.
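
For illustration, here is a minimal sketch of the extended textual form this
change enables. The `pad with zero to` syntax follows the assemblyFormat in the
diff below; the `quidditch_snitch` prefix, SSA names, and shapes are assumptions:

```mlir
// Copy a 32x60 tensor into L1, padding the trailing dimension by 4 with zeros
// so the result has a static, vectorization-friendly shape.
%copied, %token = quidditch_snitch.start_tensor_copy %input to L1
    pad with zero to [0, 4] : tensor<32x60xf32> -> tensor<32x64xf32>
```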
zero9178 authored Aug 19, 2024
1 parent e3a9101 commit 812da1a
Showing 7 changed files with 242 additions and 56 deletions.
124 changes: 107 additions & 17 deletions codegen/compiler/src/Quidditch/Dialect/Snitch/IR/QuidditchSnitchOps.cpp
@@ -1,6 +1,7 @@
#include "QuidditchSnitchOps.h"

#include "llvm/ADT/ScopeExit.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
@@ -399,34 +400,98 @@ void MicrokernelFenceOp::replaceWithNoop(RewriterBase &rewriter) {
// StartTensorCopyOp
//===----------------------------------------------------------------------===//

LogicalResult StartTensorCopyOp::verify() {
if (getStaticHighPadAttr())
if (getStaticHighPadAttr().size() != getCopy().getType().getRank())
return emitOpError("expected padding number for every dimension");

unsigned numDynamicPads = llvm::count(
getStaticHighPad().value_or(std::nullopt), ShapedType::kDynamic);
if (numDynamicPads != getHighPad().size())
return emitOpError("expected ")
<< numDynamicPads << " dynamic padding values";

return success();
}
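
// Illustration (not part of the commit): a form this verifier accepts. The
// `quidditch_snitch` prefix, SSA names, and shapes are assumptions; the rules
// shown are those enforced above.
//
// ```mlir
// // The static list must have one entry per dimension of the copied tensor;
// // the dynamic amount %pad is carried both as an SSA operand and as a
// // kDynamic sentinel in static_high_pad.
// %b, %tb = quidditch_snitch.start_tensor_copy %arg0 to L1
//     pad with zero to [%pad, 4] : tensor<10x60xf32> -> tensor<?x64xf32>
// ```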

LogicalResult StartTensorCopyOp::fold(FoldAdaptor adaptor,
SmallVectorImpl<OpFoldResult> &results) {
if (hasPadding()) {
// Remove noop padding.
if (llvm::all_of(getStaticHighPadAttr().asArrayRef(),
[](int64_t value) { return value == 0; })) {
removeStaticHighPadAttr();
return success();
}

// Fold dynamic indices with constant values into the static list.
{
bool changed = false;
SmallVector<int64_t> padding =
llvm::to_vector(getStaticHighPadAttr().asArrayRef());
unsigned dynamicIndex = 0;
for (int64_t &value : padding) {
if (!ShapedType::isDynamic(value))
continue;

if (auto integer = dyn_cast_or_null<IntegerAttr>(
adaptor.getHighPad()[dynamicIndex])) {
value = integer.getValue().getZExtValue();
getHighPadMutable().erase(dynamicIndex);
changed = true;
} else {
dynamicIndex++;
}
}
if (changed) {
setStaticHighPad(padding);
return success();
}
}
}

auto waitOp = getCopy().getDefiningOp<WaitForTensorCopyOp>();
if (!waitOp)
return failure();
auto copyOp = waitOp.getTransferTensor().getDefiningOp<StartTensorCopyOp>();
if (!copyOp)
return failure();
if (copyOp.getStaticHighPadAttr() != getStaticHighPadAttr() ||
copyOp.getHighPad() != getHighPad())
return failure();

results.emplace_back(waitOp);
results.emplace_back(CompletedTokenAttr::get(getContext()));
return success();
}
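
// Sketch (not part of the commit) of the two padding-related folds above,
// with assumed names and shapes:
//
// ```mlir
// // All-zero padding is a no-op and is dropped:
// %r, %t = quidditch_snitch.start_tensor_copy %arg0 to L1
//     pad with zero to [0, 0] : tensor<4x8xf32> -> tensor<4x8xf32>
// // folds to:
// %r, %t = quidditch_snitch.start_tensor_copy %arg0 to L1
//     : tensor<4x8xf32> -> tensor<4x8xf32>
//
// // A dynamic pad fed by a constant is folded into the static list:
// %c4 = arith.constant 4 : index
// %s, %u = quidditch_snitch.start_tensor_copy %arg0 to L1
//     pad with zero to [0, %c4] : tensor<4x8xf32> -> tensor<4x12xf32>
// // folds to `pad with zero to [0, 4]` with no index operands.
// ```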

SmallVector<OpFoldResult> StartTensorCopyOp::getMixedHighPad() {
Builder builder(getContext());
if (!hasPadding())
return SmallVector<OpFoldResult>(getResult().getType().getRank(),
builder.getIndexAttr(0));

return getMixedValues(getStaticHighPadAttr().asArrayRef(), getHighPad(),
builder);
}

//===----------------------------------------------------------------------===//
// StartTensorCopyOp::BufferizableOpInterface
//===----------------------------------------------------------------------===//

/// Returns whether 'copy' is already in L1 memory.
/// Returns whether the allocation can be elided entirely.
/// Returns an empty optional if it was not possible to determine.
static std::optional<bool>
isInL1Memory(Value copy,
const bufferization::BufferizationOptions &options = {},
SmallVector<Value> *invocationStack = nullptr) {
std::optional<bool> StartTensorCopyOp::elidesAllocation(
const bufferization::BufferizationOptions &options,
SmallVector<Value> *invocationStack) {
// Padding cannot be elided in general, even if the copied buffer is in L1.
if (hasPadding())
return false;

FailureOr<BaseMemRefType> copyType =
invocationStack
? bufferization::getBufferType(copy, options, *invocationStack)
: bufferization::getBufferType(copy, options);
? bufferization::getBufferType(getCopy(), options, *invocationStack)
: bufferization::getBufferType(getCopy(), options);
if (failed(copyType))
return std::nullopt;

@@ -437,7 +502,7 @@ bool StartTensorCopyOp::resultBufferizesToMemoryWrite(
OpResult opResult, const bufferization::AnalysisState &state) {
assert(opResult == getResult() && "no other result");

std::optional<bool> matches = isInL1Memory(getCopy(), state.getOptions());
std::optional<bool> matches = elidesAllocation(state.getOptions());
// Conservative answer.
if (!matches)
return true;
@@ -451,7 +516,7 @@ bool StartTensorCopyOp::bufferizesToMemoryRead(
OpOperand &opOperand, const bufferization::AnalysisState &state) {
assert(opOperand == getCopyMutable() && "have only one operand");

std::optional<bool> result = isInL1Memory(getCopy(), state.getOptions());
std::optional<bool> result = elidesAllocation(state.getOptions());
// Conservative answer.
if (!result)
return true;
@@ -472,7 +537,7 @@ AliasingValueList StartTensorCopyOp::getAliasingValues(
OpOperand &opOperand, const bufferization::AnalysisState &state) {
assert(opOperand == getCopyMutable() && "have only one operand");

std::optional<bool> result = isInL1Memory(getCopy(), state.getOptions());
std::optional<bool> result = elidesAllocation(state.getOptions());
if (!result)
// Assume the worst case.
return {{getResult(), BufferRelation::Equivalent, /*isDefinite=*/false}};
@@ -488,7 +553,7 @@ AliasingValueList StartTensorCopyOp::getAliasingValues(
bool StartTensorCopyOp::bufferizesToAllocation(Value value) {
assert(value == getResult() && "have only one result");

if (isInL1Memory(getCopy()) == true)
if (elidesAllocation() == true)
return false;

// True is the conservative reply, according to the docs.
@@ -503,7 +568,7 @@ StartTensorCopyOp::getBufferType(Value value,

bool contained = llvm::is_contained(invocationStack, value);
if (!contained)
if (isInL1Memory(getCopy(), options, &invocationStack) == true)
if (elidesAllocation(options, &invocationStack) == true)
return bufferization::getBufferType(getCopy(), options, invocationStack);

// Unless contained in the invocation stack (where we are free to impose the
@@ -530,7 +595,7 @@ StartTensorCopyOp::bufferize(RewriterBase &rewriter,
if (failed(copyBuffer))
return failure();

std::optional<bool> result = isInL1Memory(getCopy(), options);
std::optional<bool> result = elidesAllocation(options);
if (!result)
return failure();

@@ -546,12 +611,20 @@ StartTensorCopyOp::bufferize(RewriterBase &rewriter,
if (failed(allocType))
return failure();

SmallVector<OpFoldResult> copyBufferSizes =
memref::getMixedSizes(rewriter, getLoc(), *copyBuffer);

// Compute the dynamic dimensions for the allocation.
SmallVector<Value> dynamicDims;
for (auto [index, shape] : llvm::enumerate(allocType->getShape())) {
for (auto [index, shape, pad] :
llvm::enumerate(allocType->getShape(), getMixedHighPad())) {
if (!ShapedType::isDynamic(shape))
continue;
dynamicDims.push_back(
rewriter.create<memref::DimOp>(getLoc(), *copyBuffer, index));

dynamicDims.push_back(affine::makeComposedAffineApply(
rewriter, getLoc(),
rewriter.getAffineDimExpr(0) + rewriter.getAffineDimExpr(1),
ArrayRef<OpFoldResult>{copyBufferSizes[index], pad}));
}

FailureOr<Value> alloc = options.createAlloc(
@@ -560,8 +633,25 @@ StartTensorCopyOp::bufferize(RewriterBase &rewriter,
if (failed(alloc))
return failure();

// Zero out the entire buffer prior to overwriting it with the copied values.
// TODO: This could be optimized to only zero regions that won't be filled
// with the copied values at the cost of 2^rank transfers instead of two.
if (hasPadding())
rewriter.create<StartZeroMemTransferOp>(getLoc(), *alloc);

// Subview into the original memory without any padding.
// As we only add padding at the end of the dimensions, the offsets are always
// zero.
Value destination = rewriter.create<memref::SubViewOp>(
getLoc(), *alloc,
/*offsets=*/
SmallVector<OpFoldResult>(allocType->getRank(), rewriter.getIndexAttr(0)),
copyBufferSizes,
/*strides=*/
SmallVector<OpFoldResult>(allocType->getRank(),
rewriter.getIndexAttr(1)));
Value token =
rewriter.create<StartDMATransferOp>(getLoc(), *copyBuffer, *alloc);
rewriter.create<StartDMATransferOp>(getLoc(), *copyBuffer, destination);

// Replace op.
replaceOpWithBufferizedValues(rewriter, getOperation(), {*alloc, token});
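
Roughly, the bufferization above produces the following buffer-level sequence for
a padded copy. This is only a sketch: the quidditch_snitch op spellings and the
L1 memory-space annotation are approximated in comments, and the shapes are made up.

```mlir
// Source: memref<30x7xf32>, padded by [2, 1] to a static 32x8 L1 buffer.
%alloc = memref.alloc() : memref<32x8xf32>            // padded L1 allocation
// 1. start_zero_mem_transfer zeroes all of %alloc (the padding value).
%dest = memref.subview %alloc[0, 0] [30, 7] [1, 1]
    : memref<32x8xf32> to memref<30x7xf32, strided<[8, 1]>>
// 2. start_dma_transfer copies the source into %dest, so only the trailing
//    padded elements of %alloc stay zero.
```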
@@ -175,39 +175,74 @@ def QuidditchSnitch_MicrokernelFenceOp : QuidditchSnitch_Op<"microkernel_fence",
}

def QuidditchSnitch_StartTensorCopyOp : QuidditchSnitch_Op<"start_tensor_copy",
[AllTypesMatch<["copy", "result"]>, Pure,
[Pure, AllRanksMatch<["copy", "result"]>,
DeclareOpInterfaceMethods<BufferizableOpInterface,
["resultBufferizesToMemoryWrite", "bufferizesToMemoryRead",
"bufferizesToMemoryWrite", "getAliasingValues", "getBufferType",
"bufferize", "bufferizesToAllocation"]>]> {

let description = [{
Operation starting a copy of a tensor to L1 memory space returning it as
a new tensor.
The contained values of the tensor in an unspecified state.
Operation starting a copy of a tensor to L1 memory space, optionally adding
padding and returning it as a new tensor.
The contained values of the resulting tensor are in an unspecified state.
See `wait_for_tensor_copy` to transform the tensor value into a state
equal to `$copy`.

This operation is a noop if `$copy` and `$result` are already in L1 and
bufferization can elide the copy.
The operation may optionally add padding at the end of each dimension of
the tensor. Zero is used as the padding value.
The dimensions of the result tensor are computed using
`dims(copy)[i] + high_pad[i]`.

This operation is a noop if `$copy` is already in L1, no padding is added,
and bufferization can elide the copy.
}];

let arguments = (ins AnyRankedTensor:$copy);
let arguments = (ins AnyRankedTensor:$copy,
Variadic<Index>:$high_pad,
OptionalAttr<DenseI64ArrayAttr>:$static_high_pad
);

let results = (outs
AnyRankedTensor:$result,
QuidditchSnitch_DMATokenType:$token
);

let assemblyFormat = [{
$copy `to` `L1` `:` type($copy) attr-dict
$copy `to` `L1`
( `pad` `with` `zero` `to`
custom<DynamicIndexList>($high_pad, $static_high_pad)^)?
`:` type($copy) `->` type($result) attr-dict
}];

let builders = [
OpBuilder<(ins "mlir::Value":$copy), [{
build($_builder, $_state, copy.getType(),
$_builder.getType<DMATokenType>(), copy,
/*high_pad=*/mlir::ValueRange(), /*static_high_pad=*/nullptr);
}]>
];

let hasVerifier = 1;

let extraClassDeclaration = [{
private:
std::optional<bool>
elidesAllocation(const mlir::bufferization::BufferizationOptions &options = {},
llvm::SmallVector<mlir::Value> *invocationStack = nullptr);
public:

bool hasPadding() {
return static_cast<bool>(getStaticHighPadAttr());
}

llvm::SmallVector<mlir::OpFoldResult> getMixedHighPad();
}];

let hasFolder = 1;
}

def QuidditchSnitch_WaitForTensorCopyOp : QuidditchSnitch_Op<"wait_for_tensor_copy",
[AllTypesMatch<["transfer_tensor", "result", "copy"]>, Pure,
[AllTypesMatch<["transfer_tensor", "result"]>, Pure,
DeclareOpInterfaceMethods<BufferizableOpInterface,
["bufferizesToMemoryRead", "bufferizesToMemoryWrite", "getAliasingValues",
"bufferize", "mustBufferizeInPlace", "isNotConflicting"]>]> {
@@ -240,7 +275,7 @@ def QuidditchSnitch_WaitForTensorCopyOp : QuidditchSnitch_Op<"wait_for_tensor_co
);

let assemblyFormat = [{
`of` $copy `to` $transfer_tensor `using` $token `:` type($transfer_tensor) attr-dict
`of` $copy `:` type($copy) `to` $transfer_tensor `using` $token `->` type($transfer_tensor) attr-dict
}];

let hasFolder = 1;