Depthwise convolution (no new pipeline) #565

Closed. newling wants to merge 4 commits from the depthwise_no_new_pipeline branch.

Conversation

@newling (Contributor) commented on Jul 17, 2024:

WIP: need to resolve lowering issues. Example IR:

// -----// IR Dump Before AMDAIEBridgeToAIR (iree-amdaie-bridge-to-air) //----- //
module {
  func.func @depthwise_conv_2d_nhwc_hwc_dispatch_0_depthwise_conv_2d_nhwc_hwc_2x12x12x64x3x3_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<Custom>} {
    %cst = arith.constant 0.000000e+00 : bf16
    %c1 = arith.constant 1 : index
    %c3 = arith.constant 3 : index
    %cst_0 = arith.constant 0.000000e+00 : f32
    %c0 = arith.constant 0 : index
    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2x14x14x64xbf16>
    memref.assume_alignment %0, 64 : memref<2x14x14x64xbf16>
    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<3x3x64xbf16>
    memref.assume_alignment %1, 64 : memref<3x3x64xbf16>
    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<2x12x12x64xf32>
    memref.assume_alignment %2, 64 : memref<2x12x12x64xf32>
    scf.forall (%arg0, %arg1, %arg2, %arg3) = (0, 0, 0, 0) to (2, 12, 12, 64) step (1, 4, 4, 16) {
      %subview = memref.subview %0[%arg0, %arg1, %arg2, %arg3] [1, 6, 6, 16] [1, 1, 1, 1] : memref<2x14x14x64xbf16> to memref<1x6x6x16xbf16, strided<[12544, 896, 64, 1], offset: ?>>
      %subview_1 = memref.subview %1[0, 0, %arg3] [3, 3, 16] [1, 1, 1] : memref<3x3x64xbf16> to memref<3x3x16xbf16, strided<[192, 64, 1], offset: ?>>
      %subview_2 = memref.subview %2[%arg0, %arg1, %arg2, %arg3] [1, 4, 4, 16] [1, 1, 1, 1] : memref<2x12x12x64xf32> to memref<1x4x4x16xf32, strided<[9216, 768, 64, 1], offset: ?>>
      %alloc = memref.alloc() : memref<1x6x6x16xbf16, 1 : i32>
      linalg.copy ins(%subview : memref<1x6x6x16xbf16, strided<[12544, 896, 64, 1], offset: ?>>) outs(%alloc : memref<1x6x6x16xbf16, 1 : i32>)
      %alloc_3 = memref.alloc() : memref<3x3x16xbf16, 1 : i32>
      linalg.copy ins(%subview_1 : memref<3x3x16xbf16, strided<[192, 64, 1], offset: ?>>) outs(%alloc_3 : memref<3x3x16xbf16, 1 : i32>)
      %alloc_4 = memref.alloc() : memref<1x4x4x16xf32, 1 : i32>
      scf.forall (%arg4, %arg5) = (0, 0) to (4, 16) step (1, 4) {
        %subview_5 = memref.subview %alloc_4[0, %arg4, 0, %arg5] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x4x4x16xf32, 1 : i32> to memref<1x1x4x4xf32, strided<[256, 64, 16, 1], offset: ?>, 1 : i32>
        %alloc_6 = memref.alloc() : memref<1x1x4x4xf32, 2 : i32>
        linalg.fill ins(%cst_0 : f32) outs(%alloc_6 : memref<1x1x4x4xf32, 2 : i32>)
        scf.for %arg6 = %c0 to %c3 step %c1 {
          scf.for %arg7 = %c0 to %c3 step %c1 {
            %3 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg4, %arg6]
            %subview_7 = memref.subview %alloc[0, %3, %arg7, %arg5] [1, 1, 4, 4] [1, 1, 1, 1] : memref<1x6x6x16xbf16, 1 : i32> to memref<1x1x4x4xbf16, strided<[576, 96, 16, 1], offset: ?>, 1 : i32>
            %subview_8 = memref.subview %alloc_3[%arg6, %arg7, %arg5] [1, 1, 4] [1, 1, 1] : memref<3x3x16xbf16, 1 : i32> to memref<1x1x4xbf16, strided<[48, 16, 1], offset: ?>, 1 : i32>
            %alloc_9 = memref.alloc() : memref<1x1x4x4xbf16, 2 : i32>
            linalg.copy ins(%subview_7 : memref<1x1x4x4xbf16, strided<[576, 96, 16, 1], offset: ?>, 1 : i32>) outs(%alloc_9 : memref<1x1x4x4xbf16, 2 : i32>)
            %alloc_10 = memref.alloc() : memref<1x1x4xbf16, 2 : i32>
            linalg.copy ins(%subview_8 : memref<1x1x4xbf16, strided<[48, 16, 1], offset: ?>, 1 : i32>) outs(%alloc_10 : memref<1x1x4xbf16, 2 : i32>)
            %4 = vector.transfer_read %alloc_9[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x1x4x4xbf16, 2 : i32>, vector<1x4x4xbf16>
            %5 = vector.transfer_read %alloc_10[%c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x4xbf16, 2 : i32>, vector<1x4xbf16>
            %6 = vector.transfer_read %alloc_6[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x1x4x4xf32, 2 : i32>, vector<1x4x4xf32>
            %7 = vector.extract %5[0] : vector<4xbf16> from vector<1x4xbf16>
            %8 = arith.extf %4 : vector<1x4x4xbf16> to vector<1x4x4xf32>
            %9 = arith.extf %7 : vector<4xbf16> to vector<4xf32>
            %10 = vector.broadcast %9 : vector<4xf32> to vector<1x4x4xf32>
            %11 = vector.fma %8, %10, %6 : vector<1x4x4xf32>
            vector.transfer_write %11, %alloc_6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x4x4xf32>, memref<1x1x4x4xf32, 2 : i32>
            memref.dealloc %alloc_9 : memref<1x1x4x4xbf16, 2 : i32>
            memref.dealloc %alloc_10 : memref<1x1x4xbf16, 2 : i32>
          }
        }
        linalg.copy ins(%alloc_6 : memref<1x1x4x4xf32, 2 : i32>) outs(%subview_5 : memref<1x1x4x4xf32, strided<[256, 64, 16, 1], offset: ?>, 1 : i32>)
        memref.dealloc %alloc_6 : memref<1x1x4x4xf32, 2 : i32>
      }
      linalg.copy ins(%alloc_4 : memref<1x4x4x16xf32, 1 : i32>) outs(%subview_2 : memref<1x4x4x16xf32, strided<[9216, 768, 64, 1], offset: ?>>)
      memref.dealloc %alloc : memref<1x6x6x16xbf16, 1 : i32>
      memref.dealloc %alloc_3 : memref<3x3x16xbf16, 1 : i32>
      memref.dealloc %alloc_4 : memref<1x4x4x16xf32, 1 : i32>
    }
    return
  }
}


@yzhang93 (Contributor) commented:

Does aievec support vector.fma?

@newling (Contributor, Author) commented on Jul 17, 2024:

> Does aievec support vector.fma?

I've asked @jsetoain about this IR, hoping for some feedback tomorrow.

@jsetoain commented:

> Does aievec support vector.fma?

I was going to say that aievec doesn't need to support vector.fma, but Peano can't handle its natural lowering at the moment. I'll wedge in a workaround while we wait for them to catch up.
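
For reference, the "natural lowering" in question is upstream MLIR's VectorToLLVM conversion, which rewrites a rank-1 vector.fma as the llvm.intr.fmuladd intrinsic (higher-rank fmas are first unrolled to rank 1). A minimal sketch of the form Peano then has to legalize:

    // After VectorToLLVM, a rank-1 vector.fma becomes (sketch):
    %r = llvm.intr.fmuladd(%a, %b, %c)
        : (vector<16xf32>, vector<16xf32>, vector<16xf32>) -> vector<16xf32>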

A contributor commented on lines 489 to 498:
tileSizeLevel0 = {
    /* N */ 0, /* output width */ 4, /* output height */ 1,
    /* channel */ 16, /* kernel height */ 0, /* kernel width */ 0};

// Inner-most scf.forall tiling. The iteration space corresponds to
// individual AIE cores. TODO(newling)
// 1) check that the core array of shape Nx(16/4) is valid. N columns?
tileSizeLevel1 = {
    /* N */ 1, /* output width */ 0, /* output height */ 0,
    /* channel */ 4, /* kernel height */ 0, /* kernel width */ 0};
Similar to channel-last Conv2d, I think we should first try tile sizes such as:

tileSizeLevel0 = {0, 4, OW, OC, 0, 0};
tileSizeLevel1 = {1, 1, OW, OC, 0, 0};
tileSizeLevel2 = {0, 0, 0, 0, 1, 1};

If the batch size is 1, we could use the 4x4 core array, which means the OW (OC) tile at the first level can be 4*OW (4*OC). See the loop sketch below.
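
For intuition, a sketch of how the three tiling levels map to loops (the structure is assumed from the IR dump earlier in this thread, not taken from the patch): level 0 produces the outer scf.forall over the workgroup tile, level 1 the inner scf.forall whose points are assigned to individual AIE cores, and level 2 the scf.for reduction loops over the 3x3 kernel window:

    scf.forall (...) {                        // level 0: workgroup tile
      scf.forall (...) {                      // level 1: one point per AIE core
        scf.for %kh = %c0 to %c3 step %c1 {   // level 2: kernel height
          scf.for %kw = %c0 to %c3 step %c1 { // level 2: kernel width
            // vectorized multiply-accumulate on the innermost tile
          }
        }
      }
    }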

@jsetoain commented:

Both #1626 and #1627 have landed, which enables vector.fma lowering for Peano.

@erwei-xilinx (Contributor) commented:

With the current mlir-air head (8727a53ccd5ad771deb15b404d7a81db07d4c7ba), depthwise convolution with i32 inputs and outputs should pass the test.

@yzhang93 (Contributor) commented on Jul 20, 2024:

@jsetoain for bf16->f32 vectorization, I got an error:

iree-compile: iree-amd-aie/compiler/plugins/target/AMD-AIE/aievec/AIEVecToLLVM.cpp:75: mlir::Value mlir::iree_compiler::aievec::forceCastValueToType(mlir::OpBuilder &, mlir::Location, mlir::Value, mlir::Type): Assertion `srcVecTy.getRank() == 1 && dstVecTy.getRank() == 1 && "only flat 1D vectors can be force casted"' failed. 

The IR snippet:

scf.forall (%arg3, %arg4) in (2, 4) {
    %subview_5 = memref.subview %alloc_4[%arg3, %arg4, 0, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<2x4x4x4xf32, 1 : i32> to memref<1x1x4x4xf32, strided<[64, 16, 4, 1], offset: ?>, 1 : i32>
    %alloc_6 = memref.alloc() : memref<1x1x4x4xf32, 2 : i32>
    linalg.fill ins(%cst_0 : f32) outs(%alloc_6 : memref<1x1x4x4xf32, 2 : i32>)
    scf.for %arg5 = %c0 to %c3 step %c1 {
      scf.for %arg6 = %c0 to %c3 step %c1 {
        %3 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg4, %arg5]
        %subview_7 = memref.subview %alloc[%arg3, %3, %arg6, 0] [1, 1, 4, 4] [1, 1, 1, 1] : memref<2x6x6x4xbf16, 1 : i32> to memref<1x1x4x4xbf16, strided<[144, 24, 4, 1], offset: ?>, 1 : i32>
        %subview_8 = memref.subview %alloc_3[%arg5, %arg6, 0] [1, 1, 4] [1, 1, 1] : memref<3x3x4xbf16, 1 : i32> to memref<1x1x4xbf16, strided<[12, 4, 1], offset: ?>, 1 : i32>
        %alloc_9 = memref.alloc() : memref<1x1x4x4xbf16, 2 : i32>
        linalg.copy ins(%subview_7 : memref<1x1x4x4xbf16, strided<[144, 24, 4, 1], offset: ?>, 1 : i32>) outs(%alloc_9 : memref<1x1x4x4xbf16, 2 : i32>)
        %alloc_10 = memref.alloc() : memref<1x1x4xbf16, 2 : i32>
        linalg.copy ins(%subview_8 : memref<1x1x4xbf16, strided<[12, 4, 1], offset: ?>, 1 : i32>) outs(%alloc_10 : memref<1x1x4xbf16, 2 : i32>)
        %4 = vector.transfer_read %alloc_9[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x1x4x4xbf16, 2 : i32>, vector<1x4x4xbf16>
        %5 = vector.transfer_read %alloc_10[%c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x1x4xbf16, 2 : i32>, vector<1x4xbf16>
        %6 = vector.transfer_read %alloc_6[%c0, %c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : memref<1x1x4x4xf32, 2 : i32>, vector<1x4x4xf32>
        %7 = vector.extract %5[0] : vector<4xbf16> from vector<1x4xbf16>
        %8 = arith.extf %4 : vector<1x4x4xbf16> to vector<1x4x4xf32>
        %9 = arith.extf %7 : vector<4xbf16> to vector<4xf32>
        %10 = vector.broadcast %9 : vector<4xf32> to vector<1x4x4xf32>
        %11 = vector.fma %8, %10, %6 : vector<1x4x4xf32>
        vector.transfer_write %11, %alloc_6[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x4x4xf32>, memref<1x1x4x4xf32, 2 : i32>
        memref.dealloc %alloc_9 : memref<1x1x4x4xbf16, 2 : i32>
        memref.dealloc %alloc_10 : memref<1x1x4xbf16, 2 : i32>
      }
    }
    linalg.copy ins(%alloc_6 : memref<1x1x4x4xf32, 2 : i32>) outs(%subview_5 : memref<1x1x4x4xf32, strided<[64, 16, 4, 1], offset: ?>, 1 : i32>)
    memref.dealloc %alloc_6 : memref<1x1x4x4xf32, 2 : i32>
  }

@jsetoain commented:

> @jsetoain for bf16->f32 vectorization, I got an error:
>
> iree-compile: iree-amd-aie/compiler/plugins/target/AMD-AIE/aievec/AIEVecToLLVM.cpp:75: mlir::Value mlir::iree_compiler::aievec::forceCastValueToType(mlir::OpBuilder &, mlir::Location, mlir::Value, mlir::Type): Assertion `srcVecTy.getRank() == 1 && dstVecTy.getRank() == 1 && "only flat 1D vectors can be force casted"' failed.

I don't understand why you're getting that error. When I try to lower your code snippet, I find issues with this:

        %7 = vector.extract %5[0] : vector<4xbf16> from vector<1x4xbf16>
        %8 = arith.extf %4 : vector<1x4x4xbf16> to vector<1x4x4xf32>
        %9 = arith.extf %7 : vector<4xbf16> to vector<4xf32>
        %10 = vector.broadcast %9 : vector<4xf32> to vector<1x4x4xf32>
        %11 = vector.fma %8, %10, %6 : vector<1x4x4xf32>

I'm not entirely sure how the broadcast ends up after the widening, but it should happen right after the extract; then you can expect both of them (extract+broadcast) to lower to one aievec intrinsic, and extf+fma to another.

That said, the extract+broadcast aren't being replaced, which makes me suspect the pattern is failing to match. It's a very old pattern, and we weren't using multidimensional vectors when it was written, so it's probably related to that. That should be an easy fix, but in the meantime I'd still have to understand why those widening operations aren't being grouped directly with the fma. My working assumption is that "mixed precision" ops are represented as single-type operations via ext+op combinations, so those have to go together.
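
A sketch of that expected ordering, applied to the snippet above (%b and %be are hypothetical intermediate names; this is a reconstruction of the grouping described, not compiler output):

    %7  = vector.extract %5[0] : vector<4xbf16> from vector<1x4xbf16>
    // extract+broadcast group, still in bf16:
    %b  = vector.broadcast %7 : vector<4xbf16> to vector<1x4x4xbf16>
    // extf+fma group: widen both operands, then accumulate:
    %8  = arith.extf %4 : vector<1x4x4xbf16> to vector<1x4x4xf32>
    %be = arith.extf %b : vector<1x4x4xbf16> to vector<1x4x4xf32>
    %11 = vector.fma %8, %be, %6 : vector<1x4x4xf32>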

@jsetoain commented on Jul 22, 2024:

Update: I've noticed an easy-to-fix issue (vector::BroadcastOp isn't being marked as illegal, I don't know why), and a bit of a complication: you're broadcasting a small vector into a larger vector, and we don't support that, since it's a more complex op. I'll work on adding it, but it will take a bit longer.
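
To illustrate the complication (a sketch; the "simple" case is one reading of what the old pattern was written for, not confirmed from the code):

    // The complication: the source is itself a vector, replicated into a
    // larger multidimensional result.
    %10 = vector.broadcast %9 : vector<4xf32> to vector<1x4x4xf32>
    // Presumably the shape the old pattern targeted: a flat 1-D splat.
    %s = vector.broadcast %f : f32 to vector<16xf32>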

@newling force-pushed the depthwise_no_new_pipeline branch from feca6f4 to 9cca226 on July 23, 2024 15:18
@newling force-pushed the depthwise_no_new_pipeline branch from c1a7d10 to 23ff9ef on July 25, 2024 19:38
@newling (Contributor, Author) commented on Jul 25, 2024:

Below are the IR evolutions for bf16->f32 and i8->i32 depthwise convolutions.

Compilation 1:

With this tiling strategy, there is an elementwise multiply-add on 16 elements (which for i8 I think is incorrect for AIE?).
Attachment: trace_i8_depthwise.txt
The final error (in llc) is:

LLVM ERROR: unable to legalize instruction: %135:_(<16 x s32>) = 
G_SHUFFLE_VECTOR %134:_(<16 x s32>), %132:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19) 
(in function: core_0_2)

which I think is because there was a problem earlier in lowering.

Compilation 2:

This does the elementwise multiply-add on 32 elements (which for i8 I think is correct for AIE?).
Attachment: trace_i8_depthwise_2.txt
Here we get:

Assertion `srcVecTy.getRank() == 1 && dstVecTy.getRank() == 1 && "only flat 1D vectors can be force casted"' failed.

in aievec-to-llvm.

Compilation 3:

This is for bf16... added here for completeness, but I'd like to focus on i8 for now.
Attachment: trace_bf16_depthwise.txt
Here we get:

error: failed to legalize operation 'vector.fma' that was explicitly marked illegal

@jsetoain can you please take a look at the attached file (compilation 2, I think, is most relevant) and see if the lowering through aievec looks sensible? Note that it is possible that there is something which Maks didn't port in c1f4984 (I hope we don't have to keep doing this porting for too long!)

@newling closed this on Aug 29, 2024
@newling deleted the depthwise_no_new_pipeline branch on August 29, 2024 18:24