[PromotePadsToL1] Implement pass lowering tensor.pad (#119)
`tensor.pad` ops, as we produce them, can be lowered to `start_tensor_copy`.
The latter performs the L1 allocation and the copy operations during
bufferization.
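
For illustration, a rough before/after of the rewrite (a sketch only; types
on the custom ops are elided, and the exact printed form follows the new test
added below):

    %cst = arith.constant 0.0 : f32
    %0 = tensor.pad %a low[0, 0] high[1, 1] {
    ^bb0(%i: index, %j: index):
      tensor.yield %cst : f32
    } : tensor<32x32xf32> to tensor<33x33xf32>

becomes

    %r, %token = quidditch_snitch.start_tensor_copy %a pad with zero to [1, 1]
    %0 = quidditch_snitch.wait_for_tensor_copy of %a to %r using %token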

The one not-so-nice part of the pass is that it does so indiscriminately
for all `tensor.pad` operations. I am quite unsure whether IREE gives any
guarantee that we would never encounter `tensor.pad` operations in kernels.

This was also the last pass required to compile the final kernel in
NsNet2.
zero9178 authored Aug 21, 2024
1 parent 7cd4f3e commit 6db2996
Showing 5 changed files with 100 additions and 0 deletions.
11 changes: 11 additions & 0 deletions codegen/compiler/src/Quidditch/Dialect/Snitch/Transforms/Passes.td
@@ -10,6 +10,17 @@ def FormMicrokernelsPass
  ];
}

def PromotePadsToL1Pass : Pass<"quidditch-promote-pads-to-l1"> {
  let description = [{
    Converts supported `tensor.pad` operations to `start_tensor_copy` and
    `wait_for_tensor_copy` pairs.
  }];

  let dependentDialects = [
    "quidditch::Snitch::QuidditchSnitchDialect",
  ];
}

def PromoteOperandsToL1Pass : Pass<"quidditch-promote-operands-to-l1"> {
let description = [{
TODO:
@@ -4,11 +4,15 @@
#include "Quidditch/Dialect/Snitch/IR/QuidditchSnitchDialect.h"
#include "Quidditch/Dialect/Snitch/IR/QuidditchSnitchOps.h"
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/UB/IR/UBOps.h"
#include "mlir/IR/Matchers.h"
#include "mlir/Interfaces/TilingInterface.h"

namespace quidditch::Snitch {
#define GEN_PASS_DEF_PROMOTEOPERANDSTOL1PASS
#define GEN_PASS_DEF_PROMOTEALLOCSTOL1PASS
#define GEN_PASS_DEF_PROMOTEPADSTOL1PASS
#include "Quidditch/Dialect/Snitch/Transforms/Passes.h.inc"
} // namespace quidditch::Snitch

@@ -32,6 +36,16 @@ class PromoteAllocsToL1
protected:
  void runOnOperation() override;
};

class PromotePadsToL1
    : public quidditch::Snitch::impl::PromotePadsToL1PassBase<PromotePadsToL1> {
public:
  using Base::Base;

protected:
  void runOnOperation() override;
};

} // namespace

using namespace mlir;
@@ -76,3 +90,32 @@ void PromoteAllocsToL1::runOnOperation() {
    tensorOp.erase();
  });
}

void PromotePadsToL1::runOnOperation() {
  getOperation()->walk([&](tensor::PadOp padOp) {
    // 'start_tensor_copy' does not yet support low padding.
    if (!padOp.hasZeroLowPad())
      return;

    Value constant = padOp.getConstantPaddingValue();
    if (!constant)
      return;

    // 'start_tensor_copy' only supports zero-padding right now.
    // Poison (undef) can also be lowered to perform zero-padding.
    if (!matchPattern(constant, m_Zero()) &&
        !matchPattern(constant, m_PosZeroFloat()) &&
        !matchPattern(constant, m_Constant<ub::PoisonAttr>(nullptr)))
      return;

    OpBuilder builder(padOp);
    // Start an asynchronous copy of the source tensor into L1, zero-filling
    // the high padding.
    auto copyOp = builder.create<StartTensorCopyOp>(
        padOp.getLoc(), padOp.getType(), builder.getType<DMATokenType>(),
        padOp.getSource(), padOp.getHigh(), padOp.getStaticHighAttr());
    // Wait on the DMA token right away and let the copied tensor replace all
    // uses of the pad.
    auto waitOp = builder.create<WaitForTensorCopyOp>(
        padOp.getLoc(), copyOp.getResult(), copyOp.getToken(),
        /*copy=*/padOp.getSource());
    padOp.replaceAllUsesWith(waitOp.getResult());
    padOp.erase();
  });
}
8 changes: 8 additions & 0 deletions codegen/compiler/src/Quidditch/Target/ConfigureForSnitch.cpp
@@ -59,6 +59,14 @@ static LogicalResult setRootConfig(FunctionOpInterface funcOp,
  SmallVector<int64_t> l1Tiles(3, 0);
  bool dualBuffer = false;

  if (funcOp.getName() ==
      "main$async_dispatch_9_matmul_transpose_b_1x161x600_f64") {
    workgroupTiles[2] = 100;

    l1Tiles[0] = 0;
    l1Tiles[1] = 56;
    dualBuffer = true;
  }
  if (funcOp.getName() ==
      "main$async_dispatch_0_matmul_transpose_b_1x400x161_f64") {
    l1Tiles[1] = 40;
3 changes: 3 additions & 0 deletions codegen/compiler/src/Quidditch/Target/QuidditchTarget.cpp
@@ -180,6 +180,8 @@ class QuidditchTargetBackend final : public IREE::HAL::TargetBackend {
        .addPass(quidditch::createRemoveTrivialLoopsPass)
        .addPass(createCanonicalizerPass)
        .addPass(createCSEPass)
        .addPass(createFuseTensorPadWithConsumerPass)
        .addPass(createConcretizePadResultShapePass)
        .addPass([] {
          return quidditch::createTensorTilePass(
              {quidditch::TilingLevel::Reduction});
@@ -191,6 +193,7 @@ class QuidditchTargetBackend final : public IREE::HAL::TargetBackend {
        })
        .addPass(createFuseTensorPadWithConsumerPass)
        .addPass(createConcretizePadResultShapePass)
        .addPass(quidditch::Snitch::createPromotePadsToL1Pass)
        .addPass(quidditch::Snitch::createPromoteOperandsToL1Pass)
        .addPass(createCanonicalizerPass)
        .addPass(createCSEPass)
35 changes: 35 additions & 0 deletions codegen/tests/Dialect/Snitch/Transforms/promote-pads-to-l1.mlir
@@ -0,0 +1,35 @@
// RUN: quidditch-opt %s -p "builtin.module(func.func(quidditch-promote-pads-to-l1))" --allow-unregistered-dialect | FileCheck %s

// CHECK-LABEL: @test_zero_f32(
// CHECK-SAME: %[[A:[[:alnum:]]+]]: tensor<32x32xf32>
func.func @test_zero_f32(%a : tensor<32x32xf32>) -> tensor<33x33xf32> {
  %c = arith.constant 0.0 : f32
  // CHECK: %[[R:.*]], %[[T:.*]] = quidditch_snitch.start_tensor_copy %[[A]]
  // CHECK-SAME: pad with zero to [1, 1]
  // CHECK: %[[R2:.*]] = quidditch_snitch.wait_for_tensor_copy of %[[A]]
  // CHECK-SAME: to %[[R]]
  // CHECK-SAME: using %[[T]]
  %0 = tensor.pad %a low[0, 0] high[1, 1] {
  ^bb0(%arg0: index, %arg1: index):
    tensor.yield %c : f32
  } : tensor<32x32xf32> to tensor<33x33xf32>
  // CHECK: return %[[R2]]
  return %0 : tensor<33x33xf32>
}

// CHECK-LABEL: @test_poison(
// CHECK-SAME: %[[A:[[:alnum:]]+]]: tensor<32x32xf32>
func.func @test_poison(%a : tensor<32x32xf32>) -> tensor<33x33xf32> {
  %c = ub.poison : f32
  // CHECK: %[[R:.*]], %[[T:.*]] = quidditch_snitch.start_tensor_copy %[[A]]
  // CHECK-SAME: pad with zero to [1, 1]
  // CHECK: %[[R2:.*]] = quidditch_snitch.wait_for_tensor_copy of %[[A]]
  // CHECK-SAME: to %[[R]]
  // CHECK-SAME: using %[[T]]
  %0 = tensor.pad %a low[0, 0] high[1, 1] {
  ^bb0(%arg0: index, %arg1: index):
    tensor.yield %c : f32
  } : tensor<32x32xf32> to tensor<33x33xf32>
  // CHECK: return %[[R2]]
  return %0 : tensor<33x33xf32>
}
