From 17385b95e2bd24a892f84bfd41c1eb56e1f64cfe Mon Sep 17 00:00:00 2001
From: James Newling <james.newling@gmail.com>
Date: Thu, 25 Jul 2024 12:09:10 -0700
Subject: [PATCH 1/4] first commit

---
 .../Transforms/AMDAIETileAndFuse.cpp          |  8 ++-
 .../Transforms/KernelDispatch.cpp             | 53 +++++++++++++++++--
 .../depthwise_convolution_pipeline_e2e.mlir   | 12 +++++
 3 files changed, 69 insertions(+), 4 deletions(-)
 create mode 100644 tests/samples/depthwise_convolution_pipeline_e2e.mlir

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp
index 439e7d1d7..4cfa4b089 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp
@@ -8,6 +8,7 @@
 #include "iree/compiler/Codegen/Utils/Utils.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
 #include "mlir/Dialect/Linalg/Transforms/TilingInterfaceImpl.h"
 #include "mlir/Dialect/Linalg/Utils/Utils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
@@ -167,8 +168,13 @@ void AMDAIETileAndFusePass::runOnOperation() {
     // Conv2d ops, because there could be four parallel tiling dimensions.
     // Somehow `linalg::isaConvolutionOpInterface()` doesn't work properly.
     // TODO (vivian): create AIE specific mapping attributes.
+    //
+    // TODO (newling): check if we can use
+    // `!isa<linalg::ConvolutionOpInterface>(consumerOp.getOperation())`.
+    //
     if (!isa<linalg::Conv2DNhwcHwcfOp, linalg::Conv2DNchwFchwOp,
-             linalg::Conv2DNhwcHwcfQOp>(consumerOp)) {
+             linalg::Conv2DNhwcHwcfQOp, linalg::DepthwiseConv2DNhwcHwcOp>(
+            consumerOp)) {
       if (tilingLevel == 0) {
         options.setMapping(
             {gpu::GPUBlockMappingAttr::get(context, gpu::MappingId::DimY),
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp
index ba9520d82..693b7070d 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp
@@ -10,6 +10,7 @@
 #include "iree-amd-aie/Transforms/AMDAIEUtils.h"
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
 #include "iree/compiler/Codegen/Utils/CPUUtils.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/MemRef/Transforms/Transforms.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 
@@ -465,6 +466,51 @@ static LogicalResult setRootConfigForConvDecomposePipeline(
     tileSizeLevel0 = {0, OC, 4, OW, 0, 0, 0};
     tileSizeLevel1 = {1, OC, 1, OW, 0, 0, 0};
     tileSizeLevel2 = {0, 0, 0, 0, IC, 1, 1};
+  } else if (isa<linalg::DepthwiseConv2DNhwcHwcOp>(linalgOp)) {
+    // Notes:
+    // =====
+    // Below we target a 4x4 array of AIE cores.
+    //
+    // An inherent property of depthwise convolutions is that they cannot be
+    // expressed in terms of matmuls, unlike the above (dense) conv-2ds. The
+    // tile sizes we choose below are therefore not constrained by the AIE
+    // matmul instructions.
+    //
+    // The logic is currently fragile, and there are no guardrails: there are
+    // no checks that the data tiles are not too large, or that the input
+    // dimensions are perfectly tiled by the hard-coded tile dimensions below.
+    // These will be done as a follow-up task.
+
+    // Outermost scf.forall tiling. Defines the sizes of data tiles in L2
+    // (shared memory). Specifically, with the values selected below
+    // - the output data has 4 x 4 x 16 elements.
+    // - the input data has (3 + kh) x (3 + kw) x 16 elements (unit strides)
+    // - the kernel has kh x kw x 16 elements.
+    // TODO(newling)
+    // 1) check that the output tile perfectly tiles the input image
+    // 2) check that the sum of the L2 (memtile) allocations is within memory
+    //    budget
+    tileSizeLevel0 = {
+        /* N */ 1,        /* output height */ 4, /* output width */ 4,
+        /* channel */ 16, /* kernel height */ 0, /* kernel width */ 0};
+
+    // Inner-most scf.forall tiling. The iteration space corresponds to
+    // individual AIE cores.
+    tileSizeLevel1 = {
+        /* N */ 0,       /* output height */ 1, /* output width */ 0,
+        /* channel */ 4, /* kernel height */ 0, /* kernel width */ 0};
+
+    // The scf.for loops that each core runs. The inner-most scf.for loop
+    // contains L1 allocations, which are copied to L2 at every iteration
+    // of the inner-most scf.for loop. These tile sizes define L1 allocation
+    // sizes. With the current design, we iterate over the kh x kw dimensions,
+    // and perform an elementwise multiplication between a 4x4 tensor (from the
+    // input image) and a vector of size 4 (broadcast to 4x4).
+    tileSizeLevel2 = {
+        /* N */ 0,       /* output height */ 0, /* output width */ 0,
+        /* channel */ 0, /* kernel height */ 1, /* kernel width */ 1};
+  } else {
+    assert(false && "Support must be added for this convolution op");
   }
   TileSizesListType tileSizes = {tileSizeLevel0, tileSizeLevel1,
                                  tileSizeLevel2};
@@ -641,9 +687,10 @@ static LogicalResult setRootConfigImpl(mlir::FunctionOpInterface entryPointFn,
         // add support for them, this way we can verify our work.
         // TODO (vivian): add support for other conv interface ops
         .Case<linalg::Conv2DNhwcHwcfOp, linalg::Conv2DNchwFchwOp,
-              linalg::Conv2DNhwcHwcfQOp>([&](auto op) {
-          return setConvRootConfig(entryPointFn, op, passPipeline, cfg);
-        })
+              linalg::Conv2DNhwcHwcfQOp, linalg::DepthwiseConv2DNhwcHwcOp>(
+            [&](auto op) {
+              return setConvRootConfig(entryPointFn, op, passPipeline, cfg);
+            })
         .Case<linalg::GenericOp>([&](auto op) {
           return setRootConfig(entryPointFn, op, passPipeline, cfg);
         })
diff --git a/tests/samples/depthwise_convolution_pipeline_e2e.mlir b/tests/samples/depthwise_convolution_pipeline_e2e.mlir
new file mode 100644
index 000000000..1851f3b8d
--- /dev/null
+++ b/tests/samples/depthwise_convolution_pipeline_e2e.mlir
@@ -0,0 +1,12 @@
+// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources %s | iree-opt --mlir-disable-threading  --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-tile-pipeline=conv-decompose --split-input-file --mlir-print-ir-before=iree-amdaie-bridge-to-air | FileCheck %s
+
+func.func @depthwise_conv_2d_nhwc_hwc(%arg0: tensor<2x14x14x64xbf16>, %arg1: tensor<3x3x64xbf16>) -> tensor<2x12x12x64xf32> {
+  %cst = arith.constant 0.0 : f32
+  %0 = tensor.empty() : tensor<2x12x12x64xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<2x12x12x64xf32>) -> tensor<2x12x12x64xf32>
+  %2 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<2x14x14x64xbf16>, tensor<3x3x64xbf16>) outs(%1 : tensor<2x12x12x64xf32>) -> tensor<2x12x12x64xf32>
+  return %2 : tensor<2x12x12x64xf32>
+}
+
+// CHECK: WIP. Still investigating lowering. 
+

From d679f05fde748421f4850f7068d8b7c31a7199b9 Mon Sep 17 00:00:00 2001
From: James Newling <james.newling@gmail.com>
Date: Thu, 25 Jul 2024 12:38:03 -0700
Subject: [PATCH 2/4] add i8 conv

---
 .../depthwise_convolution_bf16_pipeline_e2e.mlir    | 12 ++++++++++++
 .../depthwise_convolution_i8_pipeline_e2e.mlir      | 13 +++++++++++++
 2 files changed, 25 insertions(+)
 create mode 100644 tests/samples/depthwise_convolution_bf16_pipeline_e2e.mlir
 create mode 100644 tests/samples/depthwise_convolution_i8_pipeline_e2e.mlir

diff --git a/tests/samples/depthwise_convolution_bf16_pipeline_e2e.mlir b/tests/samples/depthwise_convolution_bf16_pipeline_e2e.mlir
new file mode 100644
index 000000000..413efbfc2
--- /dev/null
+++ b/tests/samples/depthwise_convolution_bf16_pipeline_e2e.mlir
@@ -0,0 +1,12 @@
+// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources %s | iree-opt --mlir-disable-threading  --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-tile-pipeline=conv-decompose --split-input-file --mlir-print-ir-before-all | FileCheck %s
+
+func.func @depthwise_conv_2d_nhwc_hwc(%arg0: tensor<2x14x14x64xbf16>, %arg1: tensor<3x3x64xbf16>) -> tensor<2x12x12x64xf32> {
+  %cst = arith.constant 0.0 : f32
+  %0 = tensor.empty() : tensor<2x12x12x64xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<2x12x12x64xf32>) -> tensor<2x12x12x64xf32>
+  %2 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<2x14x14x64xbf16>, tensor<3x3x64xbf16>) outs(%1 : tensor<2x12x12x64xf32>) -> tensor<2x12x12x64xf32>
+  return %2 : tensor<2x12x12x64xf32>
+}
+
+// CHECK: WIP. Still investigating lowering. 
+
diff --git a/tests/samples/depthwise_convolution_i8_pipeline_e2e.mlir b/tests/samples/depthwise_convolution_i8_pipeline_e2e.mlir
new file mode 100644
index 000000000..1cbe6eab8
--- /dev/null
+++ b/tests/samples/depthwise_convolution_i8_pipeline_e2e.mlir
@@ -0,0 +1,13 @@
+// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources %s | iree-opt --mlir-disable-threading  --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-tile-pipeline=conv-decompose --split-input-file --mlir-print-ir-before-all | FileCheck %s
+
+
+func.func @depthwise_conv_2d_nhwc_hwc(%arg0: tensor<2x14x14x64xi8>, %arg1: tensor<3x3x64xi8>) -> tensor<2x12x12x64xi32> {
+  %cst = arith.constant 0 : i32
+  %0 = tensor.empty() : tensor<2x12x12x64xi32>
+  %1 = linalg.fill ins(%cst : i32) outs(%0 : tensor<2x12x12x64xi32>) -> tensor<2x12x12x64xi32>
+  %2 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<2x14x14x64xi8>, tensor<3x3x64xi8>) outs(%1 : tensor<2x12x12x64xi32>) -> tensor<2x12x12x64xi32>
+  return %2 : tensor<2x12x12x64xi32>
+}
+
+// CHECK: WIP. Still investigating lowering. 
+

From 23ff9efe60eeca7d42a6475e888e8ae2b3547c90 Mon Sep 17 00:00:00 2001
From: James Newling <james.newling@gmail.com>
Date: Thu, 25 Jul 2024 12:38:22 -0700
Subject: [PATCH 3/4] bifurcate

---
 .../samples/depthwise_convolution_pipeline_e2e.mlir  | 12 ------------
 1 file changed, 12 deletions(-)
 delete mode 100644 tests/samples/depthwise_convolution_pipeline_e2e.mlir

diff --git a/tests/samples/depthwise_convolution_pipeline_e2e.mlir b/tests/samples/depthwise_convolution_pipeline_e2e.mlir
deleted file mode 100644
index 1851f3b8d..000000000
--- a/tests/samples/depthwise_convolution_pipeline_e2e.mlir
+++ /dev/null
@@ -1,12 +0,0 @@
-// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources %s | iree-opt --mlir-disable-threading  --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-tile-pipeline=conv-decompose --split-input-file --mlir-print-ir-before=iree-amdaie-bridge-to-air | FileCheck %s
-
-func.func @depthwise_conv_2d_nhwc_hwc(%arg0: tensor<2x14x14x64xbf16>, %arg1: tensor<3x3x64xbf16>) -> tensor<2x12x12x64xf32> {
-  %cst = arith.constant 0.0 : f32
-  %0 = tensor.empty() : tensor<2x12x12x64xf32>
-  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<2x12x12x64xf32>) -> tensor<2x12x12x64xf32>
-  %2 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<2x14x14x64xbf16>, tensor<3x3x64xbf16>) outs(%1 : tensor<2x12x12x64xf32>) -> tensor<2x12x12x64xf32>
-  return %2 : tensor<2x12x12x64xf32>
-}
-
-// CHECK: WIP. Still investigating lowering. 
-

From 8e877009e73e00cfe4ac479ac154bb7425e54e71 Mon Sep 17 00:00:00 2001
From: James Newling <james.newling@gmail.com>
Date: Thu, 25 Jul 2024 12:41:58 -0700
Subject: [PATCH 4/4] add i32 version

---
 .../depthwise_convolution_i32_pipeline_e2e.mlir     | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 tests/samples/depthwise_convolution_i32_pipeline_e2e.mlir

diff --git a/tests/samples/depthwise_convolution_i32_pipeline_e2e.mlir b/tests/samples/depthwise_convolution_i32_pipeline_e2e.mlir
new file mode 100644
index 000000000..2bc560c57
--- /dev/null
+++ b/tests/samples/depthwise_convolution_i32_pipeline_e2e.mlir
@@ -0,0 +1,13 @@
+// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources %s | iree-opt --mlir-disable-threading  --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-tile-pipeline=conv-decompose --split-input-file --mlir-print-ir-before-all | FileCheck %s
+
+
+func.func @depthwise_conv_2d_nhwc_hwc(%arg0: tensor<2x14x14x64xi32>, %arg1: tensor<3x3x64xi32>) -> tensor<2x12x12x64xi32> {
+  %cst = arith.constant 0 : i32
+  %0 = tensor.empty() : tensor<2x12x12x64xi32>
+  %1 = linalg.fill ins(%cst : i32) outs(%0 : tensor<2x12x12x64xi32>) -> tensor<2x12x12x64xi32>
+  %2 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<2x14x14x64xi32>, tensor<3x3x64xi32>) outs(%1 : tensor<2x12x12x64xi32>) -> tensor<2x12x12x64xi32>
+  return %2 : tensor<2x12x12x64xi32>
+}
+
+// CHECK: WIP. Still investigating lowering. 
+