bump aie and llvm (#575)

Xilinx · May 15, 2024 · 6f03b9a · 6f03b9a
1 parent 630fc9d
commit 6f03b9a
Show file tree

Hide file tree

Showing 23 changed files with 68 additions and 68 deletions.
diff --git a/docs/GEMMCaseStudy.md b/docs/GEMMCaseStudy.md
@@ -32,7 +32,7 @@ The MLIR-AIR compilation pipeline used by the Ryzen AI E2E [board test](https://
 'canonicalize', 'cse'  
 'func.func(air-renumber-dma)'
 'canonicalize', 'cse'  
-['air-to-aie{row-offset=2 col-offset=0 device=npu emit-while-loop=true}'](#air-to-aie)  
+['air-to-aie{row-offset=2 col-offset=0 device=npu1_4col emit-while-loop=true}'](#air-to-aie)  
 'canonicalize'
 ['air-to-std'](#air-to-std)  
 'canonicalize'  
@@ -55,7 +55,7 @@ The MLIR-AIR compilation pipeline used by the Ryzen AI E2E [board test](https://
 |Memtile DMA BD Optimization    |   <br> <ul><li>`air-isolate-async-dma-loop-nests`</li><li>`func.func(air-loop-fusion)`</li><li>`air-specialize-channel-wrap-and-stride`</li></ul>    |   Lowering L2 control flow program into finite-state machines made of Block Descriptors as states. |
 |Double buffering    |   <br> <ul><li>`air-label-scf-for-to-ping-pong`</li><li>`air-ping-pong-transform{keep-memref-dealloc=true}`</li></ul>    |   Detecting and lowering double buffering opportunities by analyzing data production and consumption patterns to a `memref` within an `scf.for` loop; explicitly represent the multiple asynchronous threads traversing through the loop. |
 |Outline air.herd to aie.tiles    |   <br> <ul><li>`func.func(air-collapse-herd{max-col-size=4})`</li><li>`air-place-herds{num-rows=4 num-cols=4 row-anchor=2 col-anchor=0}`</li><li>`func.func(air-renumber-dma)`</li></ul>    |   Reshaping and placing `air.herd` onto `air.segment`; inferring `air.segment` shape and size. |
-|Convert MLIR-AIR to MLIR-AIE    |   <br> <ul><li>`func.func(air-renumber-dma)`</li><li>`air-to-aie{row-offset=2 col-offset=0 device=npu emit-while-loop=true}`</li></ul>    |   Converting to MLIR-AIE dialect. Clone the `func.func` op, where one copy lowers to the circuit design to be mapped onto AIE tiles, and the other copy lowers to LX6 control program; outline `air.herd` body into `aie.core` kernel; materialize asynchronous `air.channel.put/get` into dma block descriptors and `aie.lock`. |
+|Convert MLIR-AIR to MLIR-AIE    |   <br> <ul><li>`func.func(air-renumber-dma)`</li><li>`air-to-aie{row-offset=2 col-offset=0 device=npu1_4col emit-while-loop=true}`</li></ul>    |   Converting to MLIR-AIE dialect. Clone the `func.func` op, where one copy lowers to the circuit design to be mapped onto AIE tiles, and the other copy lowers to LX6 control program; outline `air.herd` body into `aie.core` kernel; materialize asynchronous `air.channel.put/get` into dma block descriptors and `aie.lock`. |
 |SHIM DMA BD Optimization    |   <br> <ul><li>`air-to-std`</li><li>`func.func(affine-loop-opt{affine-opt-tile-sizes=4,4})`</li><li>`func.func(air-unroll-outer-affine-loops{depth=2})`</li><li>`airrt-to-npu`</li></ul>    |   Converting the control code via AIRRt and AIEX.NPU dialect to NPU SHIM DMA instruction sequence. |
 ||||||
 
@@ -1247,7 +1247,7 @@ Converts the MLIR-AIR dialect code into AIRRt dialect which represents the runti
 *Input IR:*
 ```
 module {
-  aie.device(npu) {
+  aie.device(npu1_4col) {
     ...
     aie.shim_dma_allocation @airMemcpyId78(S2MM, 0, 0)
     memref.global "public" @airMemcpyId78 : memref<32x128xi32, 1>
@@ -1281,7 +1281,7 @@ The input IR contains some `air.channel.put` and `air.channel.get` memory operat
 *Output IR:*
 ```
 module {
-  aie.device(npu) {
+  aie.device(npu1_4col) {
     ...
     aie.shim_dma_allocation @airMemcpyId78(S2MM, 0, 0)
     memref.global "public" @airMemcpyId78 : memref<32x128xi32, 1>
@@ -1423,7 +1423,7 @@ Converts the runtime program, described in AIRRt dialect, into instruction seque
 *Input IR:*
 ```
 module {
-  aie.device(npu) {
+  aie.device(npu1_4col) {
     ...
     aie.shim_dma_allocation @airMemcpyId78(S2MM, 0, 0)
     memref.global "public" @airMemcpyId78 : memref<32x128xi32, 1>
@@ -1458,7 +1458,7 @@ The input IR contains some L3 memory operations (`airrt.dma_memcpy_nd`) optional
 *Output IR:*
 ```
 module {
-  aie.device(npu) {
+  aie.device(npu1_4col) {
     ...
     aie.shim_dma_allocation @airMemcpyId78(S2MM, 0, 0)
     memref.global "public" @airMemcpyId78 : memref<32x128xi32, 1>

diff --git a/examples/air_to_npu/aie.py b/examples/air_to_npu/aie.py
@@ -130,7 +130,7 @@ def forward(lhs, rhs):
     ################################################
 
     pipeline = "builtin.module("+",".join([
-        'air-to-aie{row-offset=2 col-offset=0 device=npu emit-while-loop=true}',
+        'air-to-aie{row-offset=2 col-offset=0 device=npu1_4col emit-while-loop=true}',
         'canonicalize',
     ])+')'
     pm = air.passmanager.PassManager.parse(pipeline)

diff --git a/mlir/include/air/Conversion/Passes.td b/mlir/include/air/Conversion/Passes.td
@@ -386,7 +386,7 @@ def AIRRtToNpu : Pass<"airrt-to-npu", "ModuleOp"> {
     Input:
     ```mlir
     module {
-      aie.device(npu) {
+      aie.device(npu1_1col) {
         ...
         aie.shim_dma_allocation @airMemcpyId78(S2MM, 0, 0)
         memref.global "public" @airMemcpyId78 : memref<32x128xi32, 1>
@@ -420,7 +420,7 @@ def AIRRtToNpu : Pass<"airrt-to-npu", "ModuleOp"> {
     Output:
     ```mlir
     module {
-      aie.device(npu) {
+      aie.device(npu1_1col) {
         ...
         aie.shim_dma_allocation @airMemcpyId78(S2MM, 0, 0)
         memref.global "public" @airMemcpyId78 : memref<32x128xi32, 1>

diff --git a/mlir/test/Conversion/AIRLowering/air_to_npu.mlir b/mlir/test/Conversion/AIRLowering/air_to_npu.mlir
@@ -7,7 +7,7 @@
 
 // RUN: air-opt %s -air-to-std -canonicalize -cse --split-input-file | FileCheck %s
 
-// CHECK-LABEL: aie.device(npu)
+// CHECK-LABEL: aie.device(npu1_1col)
 // CHECK: {sym_name = "segment0"}
 // CHECK: func.func @func0(%[[VAL_0:.*]]: memref<64xi32>, %[[VAL_1:.*]]: memref<64xi32>)
 // CHECK-DAG: %[[CST_0:.*]] = arith.constant 0 : i64
@@ -20,7 +20,7 @@
 // CHECK: airrt.dma_memcpy_nd(%[[CST_7]], %[[CST_0]], %[[CST_0]], %[[VAL_1]][%[[CST_0]], %[[CST_0]], %[[CST_0]], %[[CST_0]]], [%[[CST_1]], %[[CST_1]], %[[CST_1]], %[[CST_64]]], [%[[CST_0]], %[[CST_0]], %[[CST_0]]]) {metadata = @airMemcpyId7} : (i32, i64, i64, memref<64xi32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64])
 
 module {
-  aie.device(npu) {
+  aie.device(npu1_1col) {
     aie.shim_dma_allocation @airMemcpyId7(S2MM, 0, 0)
     memref.global "public" @airMemcpyId7 : memref<64xi32, 1>
     aie.shim_dma_allocation @airMemcpyId2(MM2S, 0, 0)
@@ -68,7 +68,7 @@ module {
 
 // Asynchronous version
 
-// CHECK-LABEL: aie.device(npu)
+// CHECK-LABEL: aie.device(npu1_1col)
 // CHECK: {sym_name = "segment0"}
 // CHECK: func.func @func1(%[[VAL_0:.*]]: memref<64xi32>, %[[VAL_1:.*]]: memref<64xi32>)
 // CHECK-DAG: %[[CST_0:.*]] = arith.constant 0 : i64
@@ -81,7 +81,7 @@ module {
 // CHECK: airrt.dma_memcpy_nd(%[[CST_7]], %[[CST_0]], %[[CST_0]], %[[VAL_1]][%[[CST_0]], %[[CST_0]], %[[CST_0]], %[[CST_0]]], [%[[CST_1]], %[[CST_1]], %[[CST_1]], %[[CST_64]]], [%[[CST_0]], %[[CST_0]], %[[CST_0]]]) {metadata = @airMemcpyId7} : (i32, i64, i64, memref<64xi32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64])
 
 module {
-  aie.device(npu) {
+  aie.device(npu1_1col) {
     aie.shim_dma_allocation @airMemcpyId7(S2MM, 0, 0)
     memref.global "public" @airMemcpyId7 : memref<64xi32, 1>
     aie.shim_dma_allocation @airMemcpyId2(MM2S, 0, 0)

diff --git a/mlir/test/Conversion/AIRRtToNpu/airrt_to_npu.mlir b/mlir/test/Conversion/AIRRtToNpu/airrt_to_npu.mlir
@@ -9,7 +9,7 @@
 
 // Synchronous airrt.dma_memcpy_nd
 
-// CHECK-LABEL: aie.device(npu)
+// CHECK-LABEL: aie.device(npu1_1col)
 // CHECK: aie.shim_dma_allocation @airMemcpyId7(S2MM, 0, 0)
 // CHECK: memref.global "public" @airMemcpyId7 : memref<64xi32, 1>
 // CHECK: aie.shim_dma_allocation @airMemcpyId2(MM2S, 0, 0)
@@ -23,7 +23,7 @@
 // CHECK: {sym_name = "segment0"}
 
 module {
-  aie.device(npu) {
+  aie.device(npu1_1col) {
     aie.shim_dma_allocation @airMemcpyId7(S2MM, 0, 0)
     memref.global "public" @airMemcpyId7 : memref<64xi32, 1>
     aie.shim_dma_allocation @airMemcpyId2(MM2S, 0, 0)
@@ -50,7 +50,7 @@ module {
 
 // Asynchronous airrt.dma_memcpy_nd
 
-// CHECK-LABEL: aie.device(npu) {
+// CHECK-LABEL: aie.device(npu1_1col) {
 // CHECK: aie.shim_dma_allocation @airMemcpyId7(S2MM, 0, 0)
 // CHECK: memref.global "public" @airMemcpyId7 : memref<64xi32, 1>
 // CHECK: aie.shim_dma_allocation @airMemcpyId2(MM2S, 0, 0)
@@ -64,7 +64,7 @@ module {
 // CHECK: } {sym_name = "segment0"}
 
 module {
-  aie.device(npu) {
+  aie.device(npu1_1col) {
     aie.shim_dma_allocation @airMemcpyId7(S2MM, 0, 0)
     memref.global "public" @airMemcpyId7 : memref<64xi32, 1>
     aie.shim_dma_allocation @airMemcpyId2(MM2S, 0, 0)
@@ -94,7 +94,7 @@ module {
 
 // air.launch iteration space unrolling
 
-// CHECK-LABEL: aie.device(npu) {
+// CHECK-LABEL: aie.device(npu1_1col) {
 // CHECK: aie.shim_dma_allocation @airMemcpyId16(S2MM, 0, 0)
 // CHECK: memref.global "public" @airMemcpyId16 : memref<32x32xi32, 1>
 // CHECK: aie.shim_dma_allocation @airMemcpyId5(MM2S, 0, 0)
@@ -115,7 +115,7 @@ module {
 
 #map = affine_map<()[s0] -> (s0 * 32)>
 module {
-  aie.device(npu) {
+  aie.device(npu1_1col) {
     aie.shim_dma_allocation @airMemcpyId16(S2MM, 0, 0)
     memref.global "public" @airMemcpyId16 : memref<32x32xi32, 1>
     aie.shim_dma_allocation @airMemcpyId5(MM2S, 0, 0)
@@ -158,13 +158,13 @@ module {
 
 // air.launch iteration space unrolling 2
 
-// CHECK-LABEL: aie.device(npu) {
+// CHECK-LABEL: aie.device(npu1_2col) {
 // CHECK:  func.func @func3(%[[ARG0:.*]]: memref<8x8xi32>)
 // CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][2, 2, 4, 4][32, 4, 8]) {id = 0 : i64, metadata = @airMemcpyId14} : memref<8x8xi32>
 
 #map = affine_map<()[s0] -> (s0 * 4)>
 module {
-  aie.device(npu) {
+  aie.device(npu1_2col) {
     aie.shim_dma_allocation @airMemcpyId14(S2MM, 0, 0)
     memref.global "public" @airMemcpyId14 : memref<4x4xi32, 2>
     aie.shim_dma_allocation @airMemcpyId14_1(S2MM, 1, 0)
@@ -206,13 +206,13 @@ module {
 
 // objectfifo lowering
 
-// CHECK-LABEL: aie.device(npu)
+// CHECK-LABEL: aie.device(npu1_2col)
 // CHECK:  func.func @func4(%[[ARG0:.*]]: memref<8x8xi32>)
 // CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][2, 2, 4, 4][32, 4, 8]) {id = 0 : i64, metadata = @air_channel_1} : memref<8x8xi32>
 
 #map = affine_map<()[s0] -> (s0 * 4)>
 module {
-  aie.device(npu) {
+  aie.device(npu1_2col) {
     %tile_0_3 = aie.tile(0, 3)
     %tile_1_3 = aie.tile(1, 3)
     %tile_0_4 = aie.tile(0, 4)
@@ -256,7 +256,7 @@ module {
 
 // Unroll repeat pattern
 
-// CHECK-LABEL: aie.device(npu)
+// CHECK-LABEL: aie.device(npu1_1col)
 // CHECK:  func.func @func5(%[[ARG0:.*]]: memref<8x8xi32>, %[[ARG1:.*]]: memref<8x8xi32>, %[[ARG2:.*]]: memref<8x8xi32>)
 // CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][2, 1, 4, 8][0, 0, 8]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<8x8xi32>
 // CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 4, 0][2, 1, 4, 8][0, 0, 8]) {id = 1 : i64, metadata = @airMemcpyId4} : memref<8x8xi32>
@@ -265,7 +265,7 @@ module {
 
 #map = affine_map<()[s0] -> (s0 * 4)>
 module {
-  aie.device(npu) {
+  aie.device(npu1_1col) {
     %tile_0_0 = aie.tile(0, 0)
     %tile_0_1 = aie.tile(0, 1)
     %tile_0_2 = aie.tile(0, 2)
@@ -311,7 +311,7 @@ module {
 
 // Populate repeat dimension (highest dimension)
 
-// CHECK-LABEL: aie.device(npu)
+// CHECK-LABEL: aie.device(npu1_1col)
 // CHECK:  func.func @func6(%[[ARG0:.*]]: memref<8x16xi32>, %[[ARG1:.*]]: memref<16x32xi32>, %[[ARG2:.*]]: memref<8x32xi32>)
 // CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][2, 1, 8, 16][0, 0, 16]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<8x16xi32>
 // CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][1, 2, 16, 16][0, 16, 32]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<16x32xi32>
@@ -320,7 +320,7 @@ module {
 #map = affine_map<()[s0] -> (s0 * 8)>
 #map1 = affine_map<()[s0] -> (s0 * 16)>
 module {
-  aie.device(npu) {
+  aie.device(npu1_1col) {
     %tile_0_0 = aie.tile(0, 0)
     aie.shim_dma_allocation @airMemcpyId12(S2MM, 0, 0)
     memref.global "public" @airMemcpyId12 : memref<1x1x8x16xi32, 1>
@@ -362,7 +362,7 @@ module {
 
 // Unroll repeat pattern + populate repeat dimension
 
-// CHECK-LABEL: aie.device(npu)
+// CHECK-LABEL: aie.device(npu1_1col)
 // CHECK:  func.func @func7(%[[ARG0:.*]]: memref<2048x512xi32>, %[[ARG1:.*]]: memref<512x2048xi32>, %[[ARG2:.*]]: memref<2048x2048xi32>)
 // CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][4, 8, 64, 64][0, 64, 512]) {id = 0 : i64, metadata = @airMemcpyId20} : memref<2048x512xi32>
 // CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 64, 0][4, 8, 64, 64][0, 64, 512]) {id = 1 : i64, metadata = @airMemcpyId20} : memref<2048x512xi32>
@@ -373,7 +373,7 @@ module {
 
 #map = affine_map<()[s0] -> (s0 * 64)>
 module {
-  aie.device(npu) {
+  aie.device(npu1_1col) {
     %tile_0_0 = aie.tile(0, 0)
     aie.shim_dma_allocation @airMemcpyId26(S2MM, 0, 0)
     memref.global "public" @airMemcpyId26 : memref<64x64xi32, 1>
@@ -429,7 +429,7 @@ module {
 // CHECK: aiex.npu.dma_memcpy_nd
 // CHECK: aiex.npu.sync
 module {
-  aie.device(npu) {
+  aie.device(npu1_1col) {
     aie.shim_dma_allocation @airMemcpyId7(S2MM, 0, 0)
     memref.global "public" @airMemcpyId7 : memref<64xi32, 1>
   } {sym_name = "herd"}
@@ -449,7 +449,7 @@ module {
 
 // Dealing with scenarios where wrap dimension in airrt.dma_memcpy_nd goes beyond the [0, 1023] hardware limit.
 
-// CHECK-LABEL: aie.device(npu)
+// CHECK-LABEL: aie.device(npu1_1col)
 // CHECK:  func.func @func9(%[[ARG0:.*]]: memref<2048x2048xi32>, %[[ARG1:.*]]: memref<2048x2048xi32>, %[[ARG2:.*]]: memref<2048x2048xi32>)
 // CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][4, 8, 64, 256][0, 256, 2048]) {id = 0 : i64, metadata = @airMemcpyId20} : memref<2048x2048xi32>
 // CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 64, 0][4, 8, 64, 256][0, 256, 2048]) {id = 1 : i64, metadata = @airMemcpyId20} : memref<2048x2048xi32>
@@ -463,7 +463,7 @@ module {
 
 #map = affine_map<()[s0] -> (s0 * 64)>
 module {
-  aie.device(npu) {
+  aie.device(npu1_1col) {
     %tile_0_0 = aie.tile(0, 0)
     aie.shim_dma_allocation @airMemcpyId26(S2MM, 0, 0)
     memref.global "public" @airMemcpyId26 : memref<64x64xi32, 1>
@@ -519,15 +519,15 @@ module {
 
 // Dealing with scenarios where wrap dimension in airrt.dma_memcpy_nd goes beyond the [0, 1023] hardware limit (test case 2).
 
-// CHECK-LABEL: aie.device(npu)
+// CHECK-LABEL: aie.device(npu1_1col)
 // CHECK:  func.func @func10(%[[ARG0:.*]]: memref<2654208xi32>)
 // CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][3, 768, 3, 32][128, 3456, 1152]) {id = 0 : i64, metadata = @airMemcpyId21} : memref<2654208xi32>
 // CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][3, 768, 3, 32][128, 3456, 1152]) {id = 1 : i64, metadata = @airMemcpyId21} : memref<2654208xi32>
 // CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][3, 768, 3, 32][128, 3456, 1152]) {id = 2 : i64, metadata = @airMemcpyId21} : memref<2654208xi32>
 
 #map = affine_map<()[s0] -> (s0 * 64)>
 module {
-  aie.device(npu) {
+  aie.device(npu1_1col) {
     %tile_0_0 = aie.tile(0, 0)
     aie.shim_dma_allocation @airMemcpyId21(MM2S, 0, 2)
     memref.global "public" @airMemcpyId21 : memref<256x64xbf16, 1>
@@ -567,7 +567,7 @@ module {
 // CHECK-SAME: %arg0: memref<8192xi32>
 // CHECK-NEXT: aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][4, 4, 32, 16][2048, 16, 64]){{.*}}: memref<8192xi32>
 module {
-  aie.device(npu) {
+  aie.device(npu1_1col) {
     func.func @func11(%arg0: memref<128x128xbf16>, %arg1: memref<128x128xbf16>) {
       %c0_i32 = arith.constant 0 : i32
       %c0_i64 = arith.constant 0 : i64
@@ -717,7 +717,7 @@ module {
 #map2 = affine_map<()[s0] -> (s0 * 256 + 128)>
 #map3 = affine_map<()[s0] -> (s0 * 256 + 192)>
 module {
-  aie.device(npu) {
+  aie.device(npu1_1col) {
     aie.shim_dma_allocation @airMemcpyId45(S2MM, 0, 0)
     memref.global "public" @airMemcpyId45 : memref<256x256xbf16, 1>
     aie.shim_dma_allocation @airMemcpyId46(S2MM, 1, 0)
@@ -813,7 +813,7 @@ module {
 
 // Avoid folding for loop into wrap-and-stride, if the outcome is stride > 1M; unroll BDs instead.
 
-// CHECK-LABEL: aie.device(npu)
+// CHECK-LABEL: aie.device(npu1_1col)
 // CHECK:  func.func @func18(%[[ARG0:.*]]: memref<8192x32768xi32>)
 // CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 4, 64, 64][0, 64, 32768]) {id = 0 : i64, metadata = @airMemcpyId26} : memref<8192x32768xi32>
 // CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 64, 0][1, 4, 64, 64][0, 64, 32768]) {id = 1 : i64, metadata = @airMemcpyId26} : memref<8192x32768xi32>
@@ -822,7 +822,7 @@ module {
 
 #map = affine_map<()[s0] -> (s0 * 64)>
 module {
-  aie.device(npu) {
+  aie.device(npu1_1col) {
     %tile_0_0 = aie.tile(0, 0)
     aie.shim_dma_allocation @airMemcpyId26(S2MM, 0, 0)
     memref.global "public" @airMemcpyId26 : memref<64x64xi32, 1>
@@ -858,13 +858,13 @@ module {
 
 // Big memref.
 
-// CHECK-LABEL: aie.device(npu)
+// CHECK-LABEL: aie.device(npu1_1col)
 // CHECK:  func.func @func19(%[[ARG0:.*]]: memref<308x2432xi32>)
 // CHECK:  aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][4, 19, 28, 128][0, 128, 2432]) {id = 0 : i64, metadata = @airMemcpyId26} : memref<308x2432xi32>
 
 #map = affine_map<()[s0] -> (s0 * 64)>
 module {
-  aie.device(npu) {
+  aie.device(npu1_1col) {
     %tile_0_0 = aie.tile(0, 0)
     aie.shim_dma_allocation @airMemcpyId26(S2MM, 0, 0)
     memref.global "public" @airMemcpyId26 : memref<64x64xi32, 1>