Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[TileAndFuse] Add thread groups for convolution ops #695

Merged
merged 4 commits into from
Sep 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include "iree-amd-aie/Transforms/Passes.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "llvm/ADT/StringExtras.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
Expand All @@ -16,16 +17,128 @@
#include "mlir/Dialect/SCF/Transforms/TileUsingInterface.h"
#include "mlir/Dialect/SCF/Transforms/Transforms.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
#include "mlir/IR/Iterators.h"
#include "mlir/IR/PatternMatch.h"

#define DEBUG_TYPE "iree-amdaie-tile-and-fuse"


namespace mlir::iree_compiler::AMDAIE {

namespace {

/// Kind of GPU dialect mapping attribute to generate for tiled loop
/// dimensions: block-level (`gpu::GPUBlockMappingAttr`) or thread-level
/// (`gpu::GPUThreadMappingAttr`).
enum class GPUGroupType { Block, Thread };

/// Assign GPU dialect thread/block mapping attributes to tiled dimensions.
///
/// \param tileSizesVal Tile size for each loop dimension of `op`. A value of
///        '0' means 'do not tile' and gets no mapping attribute.
/// \param groupType Whether to create block (`gpu::GPUBlockMappingAttr`) or
///        thread (`gpu::GPUThreadMappingAttr`) mapping attributes.
/// \param op The operation being tiled. It provides the MLIRContext and is
///        the operation on which errors are emitted.
///
/// \returns One attribute per tiled dimension, in dimension order. The
/// returned vector's size is the number of non-zero values in `tileSizesVal`.
/// Failure is returned (after emitting an error on `op`) if it is not possible
/// to assign mapping attributes to the dimensions.
FailureOr<SmallVector<Attribute>> getGPUMappingAttributes(
    ArrayRef<int64_t> tileSizesVal, GPUGroupType groupType,
    TilingInterface op) {
  MLIRContext *context = op.getContext();

  // There is one induction variable in the scf.forall for each of the
  // non-zero tile sizes. Recall that a '0' tile size corresponds to 'do
  // not tile'.
  uint32_t nbIndVars = std::count_if(tileSizesVal.begin(), tileSizesVal.end(),
                                     [](int64_t t) { return t != 0; });

  uint32_t nbIndVarsAboveOne =
      std::count_if(tileSizesVal.begin(), tileSizesVal.end(),
                    [](int64_t t) { return t > 1; });

  // The mlir::gpu::MappingId enum supports 13 dimensions, see:
  // https://github.com/llvm/llvm-project/blob/main
  // /mlir/include/mlir/Dialect/GPU/IR/GPUDeviceMappingAttr.td
  //
  // NOTE(review): if getMaxEnumValForMappingId() returns the largest
  // enumerator value (i.e. count - 1), this check is conservative by one
  // dimension -- confirm against the GPU dialect.
  if (nbIndVars > mlir::gpu::getMaxEnumValForMappingId()) {
    return op->emitOpError("has too many dimensions to tile, ")
           << "there are only " << mlir::gpu::getMaxEnumValForMappingId()
           << " dimensions available in the mlir::gpu dialect, but "
           << nbIndVars << " are required here..";
  }

  // Currently we expect only 2 tiled dimensions to be >1 when mapping
  // to thread dimensions. This is to target the 2-D AIE array.
  //
  // TODO(newling) if there are 3+ dimensions, we probably need to collapse
  // them into just 2. I'm leaving this as a follow-up task. Basically, instead
  // of
  //   ```(i,j,k) in (2,3,5)```
  // we want
  //   ```(i,l) in (2,15)```
  // with then
  //   j=l/5 and k=l%5.
  //
  // Once the above is implemented, we can safely remove the following check:
  if (nbIndVarsAboveOne > 2 && groupType == GPUGroupType::Thread) {
    // `tileSizesVal` is non-empty here: at least 3 entries are greater than 1.
    auto tileSizesStr = std::to_string(tileSizesVal[0]);
    for (unsigned i = 1; i < tileSizesVal.size(); ++i) {
      tileSizesStr += ", " + std::to_string(tileSizesVal[i]);
    }
    return op->emitOpError("has requested tile sizes [")
           << tileSizesStr
           << "]. Currently we only support tiling thread dimensions "
           << "with at most 2 dimensions having a tile size greater than 1, "
           << "there are " << nbIndVarsAboveOne << " here.";
  }

  // Create the block or thread mapping attribute whose MappingId has integer
  // value `i`. Every path returns a value: previously the 'unhandled' branch
  // only asserted, so with assertions compiled out control could flow off the
  // end of this value-returning lambda (undefined behaviour).
  auto getMappingAttributeForDimension = [&](uint32_t i) -> Attribute {
    auto id = static_cast<gpu::MappingId>(i);
    if (groupType == GPUGroupType::Block)
      return gpu::GPUBlockMappingAttr::get(context, id);
    assert(groupType == GPUGroupType::Thread &&
           "Unhandled group type, must be thread or block.");
    return gpu::GPUThreadMappingAttr::get(context, id);
  };

  // Map an integer to an Attribute as follows:
  // 0 -> DimY
  // 1 -> DimX
  // 2 -> DimZ
  // 3 -> LinearDim0
  // 4 -> LinearDim1
  // etc.
  //
  // Note that 0 and 1 are effectively swapped, because for AIE we want to
  // map the first dimension to AIE array columns (or something like that).
  auto getAttribute = [&](uint32_t i) -> Attribute {
    if (i == 0) return getMappingAttributeForDimension(1);
    if (i == 1) return getMappingAttributeForDimension(0);
    return getMappingAttributeForDimension(i);
  };

  // We give priority to tiling dimensions of size > 1, so that they
  // preferentially get DimY and DimX.
  SmallVector<Attribute> mapping(tileSizesVal.size(), {});
  uint32_t nAttributes = 0;
  // First pass: dimensions with tile size > 1.
  for (uint32_t i = 0; i < tileSizesVal.size(); ++i) {
    if (tileSizesVal[i] > 1) {
      mapping[i] = getAttribute(nAttributes);
      ++nAttributes;
    }
  }
  // Second pass: remaining tiled dimensions (tile size > 0 and not yet
  // assigned an attribute in the first pass).
  for (uint32_t i = 0; i < tileSizesVal.size(); ++i) {
    if (!mapping[i] && tileSizesVal[i] > 0) {
      mapping[i] = getAttribute(nAttributes);
      ++nAttributes;
    }
  }

  // Squeeze out the empty attributes (corresponding to '0's in tileSizesVal).
  SmallVector<Attribute> finalMapping;
  finalMapping.reserve(nbIndVars);
  for (Attribute attr : mapping) {
    if (attr) finalMapping.push_back(attr);
  }
  return finalMapping;
}

/// Utility function to check if any of the reduction dimension is being tiled.
static bool isTilingReductionDimension(TilingInterface consumerOp,
SmallVector<int64_t> tileSizesVal) {
Expand Down Expand Up @@ -157,27 +270,33 @@ void AMDAIETileAndFusePass::runOnOperation() {

SmallVector<OpFoldResult> tileSizes =
getAsIndexOpFoldResult(context, tileSizesVal);

auto options = scf::SCFTilingOptions().setTileSizes(tileSizes);

// When tiling using scf.for we do not need to set any mapping.
if (!useSCFFor) {
options.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);
// Here we assume there are always two levels of parallel (scf.forall)
// loops, and the first level of tiling is always using scf.forall and
// mapped to blocks. Currently we are not using mapping attributes for
// Conv2d ops, because there could be four parallel tiling dimensions.
// TODO (vivian): create AIE specific mapping attributes.
if (!isa<linalg::ConvolutionOpInterface>(consumerOp.getOperation())) {
if (tilingLevel == 0) {
options.setMapping(
{gpu::GPUBlockMappingAttr::get(context, gpu::MappingId::DimY),
gpu::GPUBlockMappingAttr::get(context, gpu::MappingId::DimX)});
} else {
options.setMapping(
{gpu::GPUThreadMappingAttr::get(context, gpu::MappingId::DimY),
gpu::GPUThreadMappingAttr::get(context, gpu::MappingId::DimX)});
}

// Currently only thread groups are used in lowering, blocks get unrolled
// temporally. In theory we should be able to just not add any block group
// dimensions to the outer scf.forall operation, but currently this results
// in compilation failure. What happens is
// 1) without any block group dimensions, the scf.forall operation can be
// canonicalized away if the tile sizes are all 1 (small matmul, for
// example). Leaving only the inner thread scf.forall.
// 2) certain passes expect an outer scf.forall operation, so if it is
// canonicalized away, the pass fails.
// So for now we're keeping the block group dimension here, but should
// be able to compile without any block group dimensions TODO(newling)
auto groupType =
tilingLevel == 0 ? GPUGroupType::Block : GPUGroupType::Thread;

auto maybeMapping =
getGPUMappingAttributes(tileSizesVal, groupType, consumerOp);
if (failed(maybeMapping)) {
return signalPassFailure();
}
options.setMapping(maybeMapping.value());
}

IRRewriter rewriter(context);
Expand Down Expand Up @@ -205,8 +324,7 @@ void AMDAIETileAndFusePass::runOnOperation() {
// Fuse all Linalg ops (can be generalized later)
.Default([&](Operation *op) {
return op->getDialect() ==
rewriter.getContext()
->getLoadedDialect<linalg::LinalgDialect>();
context->getLoadedDialect<linalg::LinalgDialect>();
});
return {fusableOp, false};
});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -644,6 +644,23 @@ void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device) {
passManager.addPass(memref::createFoldMemRefAliasOpsPass());
passManager.addPass(createAMDAIEBridgeToAIRPass());

// Running canonicalization for all pipelines here results in failures.
// Example
// ```
// 'memref.cast' op is an unsupported operation. This pass currently only
// supports AllocOp and SubViewOp as inputs.
// ```
// It is currently required for the convolution pipeline though, to remove the
// extra (size-1) thread- and group- dimensions.
//
// TODO(newling) there are better solutions like:
// 1) make canonicalization work for scf.forall
// 2) pass to collapse rank-4 scf.foralls to rank-2 scf.foralls.
// 3) resolve above 'unsupported operation' error.
if (clUseTilePipeline == TilePassPipeline::ConvDecomposePipeline) {
passManager.addPass(createCanonicalizerPass());
}

// TODO (Erwei): Figure out a way to work with AMDAIEPackToDmaPass.
if (clUseTilePipeline == TilePassPipeline::PackPeelPipeline)
passManager.addPass(createAMDAIEDecomposeLinalgExtPackUnPackToAIRPass());
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-tile-and-fuse{tiling-level=0}))' --split-input-file %s | FileCheck %s --check-prefix=TILE-LEVEL-0
// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-tile-and-fuse{tiling-level=1}))' --split-input-file %s | FileCheck %s --check-prefix=TILE-LEVEL-1
// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-tile-and-fuse{tiling-level=0}))' --split-input-file %s | FileCheck %s --check-prefix=TILE-LEVEL-0
// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-tile-and-fuse{tiling-level=1}))' --split-input-file --verify-diagnostics %s | FileCheck %s --check-prefix=TILE-LEVEL-1
// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-tile-and-fuse{tiling-level=0 tile-elementwise=false}))' --split-input-file %s | FileCheck %s --check-prefix=TILE-MATMUL-ONLY

func.func @matmul_static(%arg0: tensor<8x16xi32>, %arg1 : tensor<16x8xi32>) -> tensor<8x8xi32> {
Expand Down Expand Up @@ -32,7 +32,25 @@ func.func @conv_2d_nhwc_hwcf(%arg0: tensor<2x14x14x32xbf16>, %arg1: tensor<3x3x3
// TILE-LEVEL-0-SAME: {
// TILE-LEVEL-0: linalg.fill
// TILE-LEVEL-0: linalg.conv_2d_nhwc_hwcf
// TILE-LEVEL-0: }
// TILE-LEVEL-0: } {mapping = [#gpu.block<y>, #gpu.block<x>, #gpu.block<z>]}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In my opinion, the order of mapping attributes for block and thread should be corresponding to each other. Although we are not using block attributes at the moment, it's good to keep the attributes in the same order.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, so the logic is actually identical for threads and blocks. What's happening in this example is that the tiling sizes at thread and block levels are different:

For blocks: [0, 4, 4, 4, 0, 0, 0]
For threads: [1, 1, 4, 4, 0, 0, 0]

The logic implemented in this PR works as follows:

First, assign attributes to dimensions with tile size greater than 1. For threads, that is dimensions 2 and 3.
Second, assign attributes to dimensions with tile size equal to 1. For threads, that is dimensions 0 and 1.

The attributes assigned in the order y then x then z then linear_dim_0, linear_dim_1 etc.

For [1, 1, 4, 4, 0, 0, 0], after step 1 the assigned dimensions are
[none, none, y, x, none, none, none]
and then after step 2 the assigned dimensions are
[z, linear_dim_0, y, x, none, none, none].

And that is why at the thread level we end up with [z, linear_dim_0, y, x].

Copy link
Contributor

@yzhang93 yzhang93 Sep 3, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the explanation. Yeah, I can follow the steps to set up mapping attributes. It's just not typical that the attributes are different for the same dim in block and thread mapping. I don't have a good solution to solve this other than hardcoding the dimensions. On the other hand, since the block attributes are not used anyway, maybe we could remove these block attributes?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just tried removing block dimension tiling, but for small matmul examples where the block tile sizes are all 1, there's a problem: scf.forall without tiling attributes get canonicalized away when the iterations space is size 1. i.e. the block-level scf.forall gets removed completely. Ideally we'd be able to work without this outer scf.forall, but currently the passes aren't set up to handle this, I guess. So removing block dimensions for matmuls isn't something we can immediately do.

I could remove them for convolution, but we might have the same issue when we have small convolutions.

So yeah, not sure. Maybe, for now, it's ok to keep the block dimensions as they are for convolution?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see... We should find a way to solve the scf.forall canonicalization problem. Could you add a TODO comment for the block mapping attributes?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, I'll add a comment. Thanks for accepting :)

// TILE-LEVEL-1: @conv_2d_nhwc_hwcf
// TILE-LEVEL-1: scf.forall
// TILE-LEVEL-1-SAME: {
// TILE-LEVEL-1: linalg.fill
// TILE-LEVEL-1: linalg.conv_2d_nhwc_hwcf
// TILE-LEVEL-1: } {mapping = [#gpu.thread<z>, #gpu.thread<linear_dim_0>, #gpu.thread<y>, #gpu.thread<x>]}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks good to me.

// -----

// Negative test: the second tile-size list in the lowering_config below,
// [1, 4, 4, 4, 0, 0, 0], has three entries greater than 1. The diagnostic
// checked below is expected when the pass is run with tiling-level=1 and
// --verify-diagnostics.
func.func @conv_2d_nhwc_hwcf_unsupported_tiling(%arg0: tensor<2x14x14x32xbf16>, %arg1: tensor<3x3x32x64xbf16>) -> tensor<2x12x12x64xf32> {
%cst = arith.constant 0.000000e+00 : f32
%0 = tensor.empty() : tensor<2x12x12x64xf32>
%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<2x12x12x64xf32>) -> tensor<2x12x12x64xf32>
// expected-error @+1 {{'linalg.conv_2d_nhwc_hwcf' op has requested tile sizes [1, 4, 4, 4, 0, 0, 0]. Currently we only support tiling thread dimensions with at most 2 dimensions having a tile size greater than 1, there are 3 here.}}
%2 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 4, 4, 4, 0, 0, 0], [1, 4, 4, 4, 0, 0, 0], [0, 0, 0, 0, 1, 1, 8]]>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<2x14x14x32xbf16>, tensor<3x3x32x64xbf16>) outs(%1 : tensor<2x12x12x64xf32>) -> tensor<2x12x12x64xf32>
return %2 : tensor<2x12x12x64xf32>
}

// -----

Expand Down
Loading