diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/CMakeLists.txt
index 00af139d3..4fcaa474b 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/CMakeLists.txt
@@ -11,7 +11,6 @@ iree_lit_test_suite(
     "conv2d_nhwc_objectfifo_e2e.mlir"
     "matmul_elementwise_pack_peel_air_e2e.mlir"
     "matmul_pack_peel_air_e2e.mlir"
-    "matmul_pack_peel_objectfifo.mlir"
     "matmul_pack_peel_objectfifo_e2e.mlir"
     "matmul_pack_peel_objectfifo_ukernel_e2e.mlir"
     "matmul_pad_pack_air_e2e.mlir"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir
deleted file mode 100644
index 3365e8e0c..000000000
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir
+++ /dev/null
@@ -1,128 +0,0 @@
-// This pipeline is obtained by going into Passes.cpp, and dumping the pass pipeline (at the end of addAMDAIEObjectFifoLoweringPasses) using `passManager.dump()`. This test is included, as it can be useful to have a reference in IR of all the passes that are run.
-
-// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-distribute-l1-allocations,iree-amdaie-convert-to-dma,iree-amdaie-normalize-loop-bounds,iree-amdaie-insert-cores,iree-amdaie-localize-logicalobjectfifo,cse,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-split-logical-objectfifos-for-connection-reuse,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-tiles,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-dma-cse,iree-amdaie-hoist-logical-objectfifo,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-assign-logical-objectfifo-depth{l1-buffer-depth=2 l2-buffer-depth=2 l3-buffer-depth=1},iree-amdaie-access-to-acquire-release,iree-amdaie-none-access-to-temporary-buffer,iree-amdaie-assign-connection-types,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-composition{only-zero-stride-on-outer-dim=true},cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-assign-npu-dma-bd-ids,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-controlcode-loop-unroll,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-convert-core-forall-to-for,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-channels,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-objfifo-bufferization,iree-amdaie-connection-to-flow,iree-amdaie-assign-packet-ids,iree-amdaie-npu-dma-to-half-dma-cpy-nd,iree-amdaie-controlcode-lowering,iree-amdaie-controlcode-to-transaction,iree-amdaie-acquire-release-to-use-lock,iree-amdaie-canonicalize-npu-dma-cpy-nd{nb-dimensions=4},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-sink-into-core,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-lower-to-aie,iree-amdaie-remove-memoryspace)" --split-input-file %s | FileCheck %s
-
-
-
-// CHECK-LABEL: aie.device(npu1_4col)
-// Check a subset of the tiles:
-// CHECK-DAG: %[[TILE_0_2:.+]] = aie.tile(0, 2)
-// CHECK-DAG: %[[TILE_1_2:.+]] = aie.tile(1, 2)
-// Check a subset of the buffers and locks:
-// CHECK-DAG: %[[BUFFER_1_2:.+]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "buff_0"} : memref<1024xi32>
-// CHECK-DAG: %[[BUFFER_1_2_0:.+]] = aie.buffer(%[[TILE_1_2]]) {sym_name = "buff_1"} : memref<1024xi32>
-// CHECK-DAG: %[[LOCK_1_2:.+]] = aie.lock(%[[TILE_1_2]], 4) {init = 2 : i8, sym_name = "lock_0"}
-// CHECK-DAG: %[[LOCK_1_2_1:.+]] = aie.lock(%[[TILE_1_2]], 5) {init = 0 : i8, sym_name = "lock_1"}
-// Check a subset of cores:
-// CHECK-DAG: aie.core(%[[TILE_0_2]])
-// CHECK: aie.use_lock
-// CHECK-DAG: aie.core(%[[TILE_1_2]])
-// CHECK: aie.use_lock
-// Check a bit of the aiex.runtime_sequence:
-// CHECK: aiex.runtime_sequence @matmul_i32()
-// CHECK: } {npu_instructions = dense_resource : tensor<208xui32>, runtime_sequence_name = "matmul_i32"}
-
-#pipeline_layout = #hal.pipeline.layout<bindings = [
-  #hal.pipeline.binding<storage_buffer, ReadOnly>,
-  #hal.pipeline.binding<storage_buffer, ReadOnly>,
-  #hal.pipeline.binding<storage_buffer>
-]>
-#map = affine_map<(d0) -> (d0 * 32)>
-#map1 = affine_map<(d0) -> (d0 * 64)>
-#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>
-#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>
-#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>
-#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
-module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
-  func.func @matmul_i32() {
-    %c64 = arith.constant 64 : index
-    %c960 = arith.constant 960 : index
-    %c0_i32 = arith.constant 0 : i32
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<1024x64xi32>
-    memref.assume_alignment %0, 64 : memref<1024x64xi32>
-    %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<32x1024xi32>
-    memref.assume_alignment %1, 64 : memref<32x1024xi32>
-    %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : memref<32x64xi32>
-    memref.assume_alignment %2, 64 : memref<32x64xi32>
-    %alloc = memref.alloc() : memref<4x8x8x8xi32, 2>
-    %alloc_0 = memref.alloc() : memref<8x8x4x8xi32, 2>
-    %alloc_1 = memref.alloc() : memref<4x8x4x8xi32, 2>
-    %alloc_2 = memref.alloc() : memref<32x32xi32, 1>
-    %alloc_3 = memref.alloc() : memref<64x32xi32, 1>
-    %alloc_4 = memref.alloc() : memref<32x64xi32, 1>
-    scf.forall (%arg2, %arg3) in (1, 1) {
-      %3 = affine.apply #map(%arg2)
-      %4 = affine.apply #map1(%arg3)
-      scf.forall (%arg4, %arg5) in (1, 2) {
-        %5 = affine.apply #map(%arg4)
-        %6 = affine.apply #map(%arg5)
-        %subview_5 = memref.subview %1[%5, 0] [32, 64] [1, 1] : memref<32x1024xi32> to memref<32x64xi32, strided<[1024, 1], offset: ?>>
-        %subview_6 = memref.subview %0[0, %6] [64, 32] [1, 1] : memref<1024x64xi32> to memref<64x32xi32, strided<[64, 1], offset: ?>>
-        linalg.copy ins(%subview_5 : memref<32x64xi32, strided<[1024, 1], offset: ?>>) outs(%alloc_4 : memref<32x64xi32, 1>)
-        linalg.copy ins(%subview_6 : memref<64x32xi32, strided<[64, 1], offset: ?>>) outs(%alloc_3 : memref<64x32xi32, 1>)
-        %subview_7 = memref.subview %alloc_4[0, 0] [32, 64] [1, 1] : memref<32x64xi32, 1> to memref<32x64xi32, strided<[64, 1]>, 1>
-        %subview_8 = memref.subview %alloc_3[0, 0] [64, 32] [1, 1] : memref<64x32xi32, 1> to memref<64x32xi32, strided<[32, 1]>, 1>
-        linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<4x8x4x8xi32, 2>)
-        iree_linalg_ext.pack %subview_7 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %alloc_0 : (memref<32x64xi32, strided<[64, 1]>, 1> memref<8x8x4x8xi32, 2>)
-        iree_linalg_ext.pack %subview_8 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %alloc : (memref<64x32xi32, strided<[32, 1]>, 1> memref<4x8x8x8xi32, 2>)
-        linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_0, %alloc : memref<8x8x4x8xi32, 2>, memref<4x8x8x8xi32, 2>) outs(%alloc_1 : memref<4x8x4x8xi32, 2>) {
-        ^bb0(%in: i32, %in_10: i32, %out: i32):
-          %7 = arith.muli %in, %in_10 : i32
-          %8 = arith.addi %out, %7 : i32
-          linalg.yield %8 : i32
-        }
-      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
-      scf.for %arg4 = %c64 to %c960 step %c64 {
-        scf.forall (%arg5, %arg6) in (1, 2) {
-          %5 = affine.apply #map(%arg5)
-          %6 = affine.apply #map(%arg6)
-          %subview_7 = memref.subview %1[%5, %arg4] [32, 64] [1, 1] : memref<32x1024xi32> to memref<32x64xi32, strided<[1024, 1], offset: ?>>
-          %subview_8 = memref.subview %0[%arg4, %6] [64, 32] [1, 1] : memref<1024x64xi32> to memref<64x32xi32, strided<[64, 1], offset: ?>>
-          linalg.copy ins(%subview_7 : memref<32x64xi32, strided<[1024, 1], offset: ?>>) outs(%alloc_4 : memref<32x64xi32, 1>)
-          linalg.copy ins(%subview_8 : memref<64x32xi32, strided<[64, 1], offset: ?>>) outs(%alloc_3 : memref<64x32xi32, 1>)
-          %subview_9 = memref.subview %alloc_4[0, 0] [32, 64] [1, 1] : memref<32x64xi32, 1> to memref<32x64xi32, strided<[64, 1]>, 1>
-          %subview_10 = memref.subview %alloc_3[0, 0] [64, 32] [1, 1] : memref<64x32xi32, 1> to memref<64x32xi32, strided<[32, 1]>, 1>
-          iree_linalg_ext.pack %subview_9 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %alloc_0 : (memref<32x64xi32, strided<[64, 1]>, 1> memref<8x8x4x8xi32, 2>)
-          iree_linalg_ext.pack %subview_10 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %alloc : (memref<64x32xi32, strided<[32, 1]>, 1> memref<4x8x8x8xi32, 2>)
-          linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_0, %alloc : memref<8x8x4x8xi32, 2>, memref<4x8x8x8xi32, 2>) outs(%alloc_1 : memref<4x8x4x8xi32, 2>) {
-          ^bb0(%in: i32, %in_12: i32, %out: i32):
-            %7 = arith.muli %in, %in_12 : i32
-            %8 = arith.addi %out, %7 : i32
-            linalg.yield %8 : i32
-          }
-        } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
-      }
-      scf.forall (%arg5, %arg6) in (1, 2) {
-        %5 = affine.apply #map(%arg5)
-        %6 = affine.apply #map(%arg6)
-        %subview_7 = memref.subview %1[%5, 960] [32, 64] [1, 1] : memref<32x1024xi32> to memref<32x64xi32, strided<[1024, 1], offset: ?>>
-        %subview_8 = memref.subview %0[960, %6] [64, 32] [1, 1] : memref<1024x64xi32> to memref<64x32xi32, strided<[64, 1], offset: ?>>
-        linalg.copy ins(%subview_7 : memref<32x64xi32, strided<[1024, 1], offset: ?>>) outs(%alloc_4 : memref<32x64xi32, 1>)
-        linalg.copy ins(%subview_8 : memref<64x32xi32, strided<[64, 1], offset: ?>>) outs(%alloc_3 : memref<64x32xi32, 1>)
-
-        %subview_9 = memref.subview %alloc_4[0, 0] [32, 64] [1, 1] : memref<32x64xi32, 1> to memref<32x64xi32, strided<[64, 1]>, 1>
-        %subview_10 = memref.subview %alloc_3[0, 0] [64, 32] [1, 1] : memref<64x32xi32, 1> to memref<64x32xi32, strided<[32, 1]>, 1>
-        %subview_11 = memref.subview %alloc_2[0, 0] [32, 32] [1, 1] : memref<32x32xi32, 1> to memref<32x32xi32, strided<[32, 1]>, 1>
-        iree_linalg_ext.pack %subview_9 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %alloc_0 : (memref<32x64xi32, strided<[64, 1]>, 1> memref<8x8x4x8xi32, 2>)
-        iree_linalg_ext.pack %subview_10 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %alloc : (memref<64x32xi32, strided<[32, 1]>, 1> memref<4x8x8x8xi32, 2>)
-        linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_0, %alloc : memref<8x8x4x8xi32, 2>, memref<4x8x8x8xi32, 2>) outs(%alloc_1 : memref<4x8x4x8xi32, 2>) {
-        ^bb0(%in: i32, %in_12: i32, %out: i32):
-          %7 = arith.muli %in, %in_12 : i32
-          %8 = arith.addi %out, %7 : i32
-          linalg.yield %8 : i32
-        }
-        iree_linalg_ext.unpack %alloc_1 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %subview_11 : (memref<4x8x4x8xi32, 2> memref<32x32xi32, strided<[32, 1]>, 1>)
-        %subview = memref.subview %2[%5, %6] [32, 32] [1, 1] : memref<32x64xi32> to memref<32x32xi32, strided<[64, 1], offset: ?>>
-        linalg.copy ins(%alloc_2 : memref<32x32xi32, 1>) outs(%subview : memref<32x32xi32, strided<[64, 1], offset: ?>>)
-      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
-    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
-    memref.dealloc %alloc_4 : memref<32x64xi32, 1>
-    memref.dealloc %alloc_3 : memref<64x32xi32, 1>
-    memref.dealloc %alloc_2 : memref<32x32xi32, 1>
-    memref.dealloc %alloc_1 : memref<4x8x4x8xi32, 2>
-    memref.dealloc %alloc_0 : memref<8x8x4x8xi32, 2>
-    memref.dealloc %alloc : memref<4x8x8x8xi32, 2>
-    return
-  }
-}
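
For context on the RUN line in the deleted test: per the file's leading comment, the long --pass-pipeline string was not written by hand but obtained by dumping the pass pipeline built in Passes.cpp (addAMDAIEObjectFifoLoweringPasses) via passManager.dump(). Below is a minimal C++ sketch, using only upstream MLIR APIs, of how such a textual pipeline can be captured; the helper name dumpPipelineForLitTest is illustrative and not part of iree-amd-aie.

    #include "mlir/Pass/PassManager.h"
    #include "llvm/Support/raw_ostream.h"

    // Illustrative helper (assumption, not an iree-amd-aie API): prints the
    // textual form of the passes currently added to `pm`, i.e. the same
    // pipeline information that pm.dump() writes to stderr, so it can be
    // pasted into a lit RUN line as --pass-pipeline="...".
    void dumpPipelineForLitTest(mlir::PassManager &pm) {
      std::string pipeline;
      llvm::raw_string_ostream os(pipeline);
      pm.printAsTextualPipeline(os);
      llvm::errs() << os.str() << "\n";
    }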