From 17385b95e2bd24a892f84bfd41c1eb56e1f64cfe Mon Sep 17 00:00:00 2001 From: James Newling Date: Thu, 25 Jul 2024 12:09:10 -0700 Subject: [PATCH 1/4] first commit --- .../Transforms/AMDAIETileAndFuse.cpp | 8 ++- .../Transforms/KernelDispatch.cpp | 53 +++++++++++++++++-- .../depthwise_convolution_pipeline_e2e.mlir | 12 +++++ 3 files changed, 69 insertions(+), 4 deletions(-) create mode 100644 tests/samples/depthwise_convolution_pipeline_e2e.mlir diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp index 439e7d1d7..4cfa4b089 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp @@ -8,6 +8,7 @@ #include "iree/compiler/Codegen/Utils/Utils.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h" #include "mlir/Dialect/Linalg/Transforms/TilingInterfaceImpl.h" #include "mlir/Dialect/Linalg/Utils/Utils.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" @@ -167,8 +168,13 @@ void AMDAIETileAndFusePass::runOnOperation() { // Conv2d ops, because there could be four parallel tiling dimensions. // Somehow `linalg::isaConvolutionOpInterface()` doesn't work properly. // TODO (vivian): create AIE specific mapping attributes. 
+ // + // TODO (newling) check if we can use + // "!isa(consumerOp.getOperation()))" + // if (!isa(consumerOp)) { + linalg::Conv2DNhwcHwcfQOp, linalg::DepthwiseConv2DNhwcHwcOp>( + consumerOp)) { if (tilingLevel == 0) { options.setMapping( {gpu::GPUBlockMappingAttr::get(context, gpu::MappingId::DimY), diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp index ba9520d82..693b7070d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp @@ -10,6 +10,7 @@ #include "iree-amd-aie/Transforms/AMDAIEUtils.h" #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h" #include "iree/compiler/Codegen/Utils/CPUUtils.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/MemRef/Transforms/Transforms.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" @@ -465,6 +466,51 @@ static LogicalResult setRootConfigForConvDecomposePipeline( tileSizeLevel0 = {0, OC, 4, OW, 0, 0, 0}; tileSizeLevel1 = {1, OC, 1, OW, 0, 0, 0}; tileSizeLevel2 = {0, 0, 0, 0, IC, 1, 1}; + } else if (isa(linalgOp)) { + // Notes: + // ===== + // Below we target a 4x4 array of AIE cores. + // + // An inherent property of depthwise convolutions is that they cannot be + // expressed in terms of matmuls, unlike the above (dense) conv-2ds. The + // tile sizes we choose below are therefore not constrained by the AIE + // matmul instructions. + // + // The logic is currently fragile, and there are no guardrails: there are + // no checks that the data tiles are not too large, or that the input + // dimensions are perfectly tiled by the hard-coded tile dimensions below. + // These will be done as a follow-up task. + + // Outer most scf.forall tiling. Defines the sizes of data tiles in L2 + // (shared memory). 
Specifically, with the values selected below + // - the output data has 4 x 4 x 16 elements. + // - the input data has 4 x (4 + kh) x (1 + kw) x 16 elements + // - the kernel has kh x kw x 16 elements. + // TODO(newling) + // 1) check that the output tile perfectly tiles the input image + // 2) check that the sum of the L2 (memtile) allocations is within memory + // budget + tileSizeLevel0 = { + /* N */ 1, /* output height */ 4, /* output width */ 4, + /* channel */ 16, /* kernel height */ 0, /* kernel width */ 0}; + + // Inner-most scf.forall tiling. The iteration space corresponds to + // individual AIE cores. + tileSizeLevel1 = { + /* N */ 0, /* output height */ 1, /* output width */ 0, + /* channel */ 4, /* kernel height */ 0, /* kernel width */ 0}; + + // The scf.for loops that each core runs. The inner-most scf.for loop + // contains L1 allocations, which are copied to L2 at every iteration + // of the inner-most scf.for loop. These tile sizes define L1 allocation + // sizes. With the current design, we iterate over all kh x kw dimensions, + // and perform an elementwise multiplication between a 4x4 tensor (from the + // input image) and a vector of size 4 (broadcast to 4x4). + tileSizeLevel2 = { + /* N */ 0, /* output height */ 0, /* output width */ 0, + /* channel */ 0, /* kernel height */ 1, /* kernel width */ 1}; + } else { + assert(false && "Support must be added for this convolution op"); } TileSizesListType tileSizes = {tileSizeLevel0, tileSizeLevel1, tileSizeLevel2}; @@ -641,9 +687,10 @@ static LogicalResult setRootConfigImpl(mlir::FunctionOpInterface entryPointFn, // add support for them, this way we can verify our work. 
// TODO (vivian): add support for other conv interface ops .Case([&](auto op) { - return setConvRootConfig(entryPointFn, op, passPipeline, cfg); - }) + linalg::Conv2DNhwcHwcfQOp, linalg::DepthwiseConv2DNhwcHwcOp>( + [&](auto op) { + return setConvRootConfig(entryPointFn, op, passPipeline, cfg); + }) .Case([&](auto op) { return setRootConfig(entryPointFn, op, passPipeline, cfg); }) diff --git a/tests/samples/depthwise_convolution_pipeline_e2e.mlir b/tests/samples/depthwise_convolution_pipeline_e2e.mlir new file mode 100644 index 000000000..1851f3b8d --- /dev/null +++ b/tests/samples/depthwise_convolution_pipeline_e2e.mlir @@ -0,0 +1,12 @@ +// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources %s | iree-opt --mlir-disable-threading --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-tile-pipeline=conv-decompose --split-input-file --mlir-print-ir-before=iree-amdaie-bridge-to-air | FileCheck %s + +func.func @depthwise_conv_2d_nhwc_hwc(%arg0: tensor<2x14x14x64xbf16>, %arg1: tensor<3x3x64xbf16>) -> tensor<2x12x12x64xf32> { + %cst = arith.constant 0.0 : f32 + %0 = tensor.empty() : tensor<2x12x12x64xf32> + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<2x12x12x64xf32>) -> tensor<2x12x12x64xf32> + %2 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<2x14x14x64xbf16>, tensor<3x3x64xbf16>) outs(%1 : tensor<2x12x12x64xf32>) -> tensor<2x12x12x64xf32> + return %2 : tensor<2x12x12x64xf32> +} + +// CHECK: WIP. Still investigating lowering. 
+ From d679f05fde748421f4850f7068d8b7c31a7199b9 Mon Sep 17 00:00:00 2001 From: James Newling Date: Thu, 25 Jul 2024 12:38:03 -0700 Subject: [PATCH 2/4] add i8 conv --- .../depthwise_convolution_bf16_pipeline_e2e.mlir | 12 ++++++++++++ .../depthwise_convolution_i8_pipeline_e2e.mlir | 13 +++++++++++++ 2 files changed, 25 insertions(+) create mode 100644 tests/samples/depthwise_convolution_bf16_pipeline_e2e.mlir create mode 100644 tests/samples/depthwise_convolution_i8_pipeline_e2e.mlir diff --git a/tests/samples/depthwise_convolution_bf16_pipeline_e2e.mlir b/tests/samples/depthwise_convolution_bf16_pipeline_e2e.mlir new file mode 100644 index 000000000..413efbfc2 --- /dev/null +++ b/tests/samples/depthwise_convolution_bf16_pipeline_e2e.mlir @@ -0,0 +1,12 @@ +// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources %s | iree-opt --mlir-disable-threading --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-tile-pipeline=conv-decompose --split-input-file --mlir-print-ir-before-all | FileCheck %s + +func.func @depthwise_conv_2d_nhwc_hwc(%arg0: tensor<2x14x14x64xbf16>, %arg1: tensor<3x3x64xbf16>) -> tensor<2x12x12x64xf32> { + %cst = arith.constant 0.0 : f32 + %0 = tensor.empty() : tensor<2x12x12x64xf32> + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<2x12x12x64xf32>) -> tensor<2x12x12x64xf32> + %2 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<2x14x14x64xbf16>, tensor<3x3x64xbf16>) outs(%1 : tensor<2x12x12x64xf32>) -> tensor<2x12x12x64xf32> + return %2 : tensor<2x12x12x64xf32> +} + +// CHECK: WIP. Still investigating lowering. 
+ diff --git a/tests/samples/depthwise_convolution_i8_pipeline_e2e.mlir b/tests/samples/depthwise_convolution_i8_pipeline_e2e.mlir new file mode 100644 index 000000000..1cbe6eab8 --- /dev/null +++ b/tests/samples/depthwise_convolution_i8_pipeline_e2e.mlir @@ -0,0 +1,13 @@ +// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources %s | iree-opt --mlir-disable-threading --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-tile-pipeline=conv-decompose --split-input-file --mlir-print-ir-before-all | FileCheck %s + + +func.func @depthwise_conv_2d_nhwc_hwc(%arg0: tensor<2x14x14x64xi8>, %arg1: tensor<3x3x64xi8>) -> tensor<2x12x12x64xi32> { + %cst = arith.constant 0 : i32 + %0 = tensor.empty() : tensor<2x12x12x64xi32> + %1 = linalg.fill ins(%cst : i32) outs(%0 : tensor<2x12x12x64xi32>) -> tensor<2x12x12x64xi32> + %2 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<2x14x14x64xi8>, tensor<3x3x64xi8>) outs(%1 : tensor<2x12x12x64xi32>) -> tensor<2x12x12x64xi32> + return %2 : tensor<2x12x12x64xi32> +} + +// CHECK: WIP. Still investigating lowering. 
+ From 23ff9efe60eeca7d42a6475e888e8ae2b3547c90 Mon Sep 17 00:00:00 2001 From: James Newling Date: Thu, 25 Jul 2024 12:38:22 -0700 Subject: [PATCH 3/4] bifurcate --- .../samples/depthwise_convolution_pipeline_e2e.mlir | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 tests/samples/depthwise_convolution_pipeline_e2e.mlir diff --git a/tests/samples/depthwise_convolution_pipeline_e2e.mlir b/tests/samples/depthwise_convolution_pipeline_e2e.mlir deleted file mode 100644 index 1851f3b8d..000000000 --- a/tests/samples/depthwise_convolution_pipeline_e2e.mlir +++ /dev/null @@ -1,12 +0,0 @@ -// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources %s | iree-opt --mlir-disable-threading --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-tile-pipeline=conv-decompose --split-input-file --mlir-print-ir-before=iree-amdaie-bridge-to-air | FileCheck %s - -func.func @depthwise_conv_2d_nhwc_hwc(%arg0: tensor<2x14x14x64xbf16>, %arg1: tensor<3x3x64xbf16>) -> tensor<2x12x12x64xf32> { - %cst = arith.constant 0.0 : f32 - %0 = tensor.empty() : tensor<2x12x12x64xf32> - %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<2x12x12x64xf32>) -> tensor<2x12x12x64xf32> - %2 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<2x14x14x64xbf16>, tensor<3x3x64xbf16>) outs(%1 : tensor<2x12x12x64xf32>) -> tensor<2x12x12x64xf32> - return %2 : tensor<2x12x12x64xf32> -} - -// CHECK: WIP. Still investigating lowering. 
- From 8e877009e73e00cfe4ac479ac154bb7425e54e71 Mon Sep 17 00:00:00 2001 From: James Newling Date: Thu, 25 Jul 2024 12:41:58 -0700 Subject: [PATCH 4/4] add i32 version --- .../depthwise_convolution_i32_pipeline_e2e.mlir | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 tests/samples/depthwise_convolution_i32_pipeline_e2e.mlir diff --git a/tests/samples/depthwise_convolution_i32_pipeline_e2e.mlir b/tests/samples/depthwise_convolution_i32_pipeline_e2e.mlir new file mode 100644 index 000000000..2bc560c57 --- /dev/null +++ b/tests/samples/depthwise_convolution_i32_pipeline_e2e.mlir @@ -0,0 +1,13 @@ +// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources %s | iree-opt --mlir-disable-threading --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-tile-pipeline=conv-decompose --split-input-file --mlir-print-ir-before-all | FileCheck %s + + +func.func @depthwise_conv_2d_nhwc_hwc(%arg0: tensor<2x14x14x64xi32>, %arg1: tensor<3x3x64xi32>) -> tensor<2x12x12x64xi32> { + %cst = arith.constant 0 : i32 + %0 = tensor.empty() : tensor<2x12x12x64xi32> + %1 = linalg.fill ins(%cst : i32) outs(%0 : tensor<2x12x12x64xi32>) -> tensor<2x12x12x64xi32> + %2 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<2x14x14x64xi32>, tensor<3x3x64xi32>) outs(%1 : tensor<2x12x12x64xi32>) -> tensor<2x12x12x64xi32> + return %2 : tensor<2x12x12x64xi32> +} + +// CHECK: WIP. Still investigating lowering. +