Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Depthwise convolution (no new pipeline) #565

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
#include "mlir/Dialect/Linalg/Transforms/TilingInterfaceImpl.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
Expand Down Expand Up @@ -167,8 +168,13 @@ void AMDAIETileAndFusePass::runOnOperation() {
// Conv2d ops, because there could be four parallel tiling dimensions.
// Somehow `linalg::isaConvolutionOpInterface()` doesn't work properly.
// TODO (vivian): create AIE specific mapping attributes.
//
// TODO (newling) check if we can use
// "!isa<linalg::ConvolutionOpInterface>(consumerOp.getOperation())"
//
if (!isa<linalg::Conv2DNhwcHwcfOp, linalg::Conv2DNchwFchwOp,
linalg::Conv2DNhwcHwcfQOp>(consumerOp)) {
linalg::Conv2DNhwcHwcfQOp, linalg::DepthwiseConv2DNhwcHwcOp>(
consumerOp)) {
if (tilingLevel == 0) {
options.setMapping(
{gpu::GPUBlockMappingAttr::get(context, gpu::MappingId::DimY),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "iree-amd-aie/Transforms/AMDAIEUtils.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "iree/compiler/Codegen/Utils/CPUUtils.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/MemRef/Transforms/Transforms.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

Expand Down Expand Up @@ -465,6 +466,51 @@ static LogicalResult setRootConfigForConvDecomposePipeline(
tileSizeLevel0 = {0, OC, 4, OW, 0, 0, 0};
tileSizeLevel1 = {1, OC, 1, OW, 0, 0, 0};
tileSizeLevel2 = {0, 0, 0, 0, IC, 1, 1};
} else if (isa<linalg::DepthwiseConv2DNhwcHwcOp>(linalgOp)) {
// Notes:
// =====
// Below we target a 4x4 array of AIE cores.
//
// An inherent property of depthwise convolutions is that they cannot be
// expressed in terms of matmuls, unlike the above (dense) conv-2ds. The
// tile sizes we choose below are therefore not constrained by the AIE
// matmul instructions.
//
// The logic is currently fragile, and there are no guardrails: there are
// no checks that the data tiles are not too large, or that the input
// dimensions are perfectly tiled by the hard-coded tile dimensions below.
// These will be done as a follow-up task.

// Outermost scf.forall tiling. Defines the sizes of data tiles in L2
// (shared memory). Specifically, with the values selected below
// - the output data has 4 x 4 x 16 elements.
// - the input data has 4 x (4 + kh) x (1 + kw) x 16 elements
// - the kernel has kh x kw x 16 elements.
// TODO(newling)
// 1) check that the output tile perfectly tiles the input image
// 2) check that the sum of the L2 (memtile) allocations is within memory
// budget
tileSizeLevel0 = {
/* N */ 1, /* output height */ 4, /* output width */ 4,
/* channel */ 16, /* kernel width */ 0, /* kernel height */ 0};

// Inner-most scf.forall tiling. The iteration space corresponds to
// individual AIE cores.
tileSizeLevel1 = {
/* N */ 0, /* output height */ 1, /* output width */ 0,
/* channel */ 4, /* kernel width */ 0, /* kernel height */ 0};

// The scf.for loops that each core runs. The inner-most scf.for loop
// contains L1 allocations, which are copied to L2 at every iteration
// of the inner-most scf.for loop. These tile sizes define L1 allocation
// sizes. With the current design, we iterate over all kh x kw dimension,
// and perform an elementwise multiplication between a 4x4 tensor (from the
// input image) and a vector of size 4 (broadcast to 4x4).
tileSizeLevel2 = {
/* N */ 0, /* output height */ 0, /* output width */ 0,
/* channel */ 0, /* kernel width */ 1, /* kernel height */ 1};
} else {
assert(false && "Support must be added for this convolution op");
}
TileSizesListType tileSizes = {tileSizeLevel0, tileSizeLevel1,
tileSizeLevel2};
Expand Down Expand Up @@ -641,9 +687,10 @@ static LogicalResult setRootConfigImpl(mlir::FunctionOpInterface entryPointFn,
// add support for them, this way we can verify our work.
// TODO (vivian): add support for other conv interface ops
.Case<linalg::Conv2DNhwcHwcfOp, linalg::Conv2DNchwFchwOp,
linalg::Conv2DNhwcHwcfQOp>([&](auto op) {
return setConvRootConfig(entryPointFn, op, passPipeline, cfg);
})
linalg::Conv2DNhwcHwcfQOp, linalg::DepthwiseConv2DNhwcHwcOp>(
[&](auto op) {
return setConvRootConfig(entryPointFn, op, passPipeline, cfg);
})
.Case<linalg::GenericOp>([&](auto op) {
return setRootConfig(entryPointFn, op, passPipeline, cfg);
})
Expand Down
12 changes: 12 additions & 0 deletions tests/samples/depthwise_convolution_bf16_pipeline_e2e.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources %s | iree-opt --mlir-disable-threading --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-tile-pipeline=conv-decompose --split-input-file --mlir-print-ir-before-all | FileCheck %s

// Depthwise 2-D convolution test case: an NHWC bf16 input (2x14x14x64)
// convolved with an HWC bf16 kernel (3x3x64), accumulating into f32. With
// unit strides and dilations the 14x14 spatial dims reduce to 12x12,
// yielding a 2x12x12x64xf32 result.
func.func @depthwise_conv_2d_nhwc_hwc(%arg0: tensor<2x14x14x64xbf16>, %arg1: tensor<3x3x64xbf16>) -> tensor<2x12x12x64xf32> {
// Zero-fill the f32 output tensor so the convolution accumulates from zero.
%cst = arith.constant 0.0 : f32
%0 = tensor.empty() : tensor<2x12x12x64xf32>
%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<2x12x12x64xf32>) -> tensor<2x12x12x64xf32>
%2 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<2x14x14x64xbf16>, tensor<3x3x64xbf16>) outs(%1 : tensor<2x12x12x64xf32>) -> tensor<2x12x12x64xf32>
return %2 : tensor<2x12x12x64xf32>
}

// CHECK: WIP. Still investigating lowering.

13 changes: 13 additions & 0 deletions tests/samples/depthwise_convolution_i32_pipeline_e2e.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources %s | iree-opt --mlir-disable-threading --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-tile-pipeline=conv-decompose --split-input-file --mlir-print-ir-before-all | FileCheck %s


// Depthwise 2-D convolution test case: an NHWC i32 input (2x14x14x64)
// convolved with an HWC i32 kernel (3x3x64), accumulating into i32. With
// unit strides and dilations the 14x14 spatial dims reduce to 12x12,
// yielding a 2x12x12x64xi32 result.
func.func @depthwise_conv_2d_nhwc_hwc(%arg0: tensor<2x14x14x64xi32>, %arg1: tensor<3x3x64xi32>) -> tensor<2x12x12x64xi32> {
// Zero-fill the i32 output tensor so the convolution accumulates from zero.
%cst = arith.constant 0 : i32
%0 = tensor.empty() : tensor<2x12x12x64xi32>
%1 = linalg.fill ins(%cst : i32) outs(%0 : tensor<2x12x12x64xi32>) -> tensor<2x12x12x64xi32>
%2 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<2x14x14x64xi32>, tensor<3x3x64xi32>) outs(%1 : tensor<2x12x12x64xi32>) -> tensor<2x12x12x64xi32>
return %2 : tensor<2x12x12x64xi32>
}

// CHECK: WIP. Still investigating lowering.

13 changes: 13 additions & 0 deletions tests/samples/depthwise_convolution_i8_pipeline_e2e.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources %s | iree-opt --mlir-disable-threading --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-tile-pipeline=conv-decompose --split-input-file --mlir-print-ir-before-all | FileCheck %s


// Depthwise 2-D convolution test case: an NHWC i8 input (2x14x14x64)
// convolved with an HWC i8 kernel (3x3x64), accumulating into a wider i32
// output. With unit strides and dilations the 14x14 spatial dims reduce to
// 12x12, yielding a 2x12x12x64xi32 result.
func.func @depthwise_conv_2d_nhwc_hwc(%arg0: tensor<2x14x14x64xi8>, %arg1: tensor<3x3x64xi8>) -> tensor<2x12x12x64xi32> {
// Zero-fill the i32 accumulator so the convolution accumulates from zero.
%cst = arith.constant 0 : i32
%0 = tensor.empty() : tensor<2x12x12x64xi32>
%1 = linalg.fill ins(%cst : i32) outs(%0 : tensor<2x12x12x64xi32>) -> tensor<2x12x12x64xi32>
%2 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<2x14x14x64xi8>, tensor<3x3x64xi8>) outs(%1 : tensor<2x12x12x64xi32>) -> tensor<2x12x12x64xi32>
return %2 : tensor<2x12x12x64xi32>
}

// CHECK: WIP. Still investigating lowering.

Loading