From fbdcad320857d3a534ea6d955279f0d0f4c3a99c Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Tue, 16 Jul 2024 19:05:56 -0600
Subject: [PATCH] Access L2 Segment Allocation in Herd with Python Example
 (#668)

---
 .../worker_to_self/worker_to_self.py          |  62 +++++-----
 programming_examples/segment_alloc/Makefile   |  12 ++
 programming_examples/segment_alloc/run.py     |  88 ++++++++++++++
 .../segment_alloc/run_makefile.lit            |   8 ++
 .../segment_alloc/segment_alloc.py            | 109 ++++++++++++++++++
 python/air/dialects/_air_ops_ext.py           |  29 ++++-
 6 files changed, 272 insertions(+), 36 deletions(-)
 create mode 100644 programming_examples/segment_alloc/Makefile
 create mode 100644 programming_examples/segment_alloc/run.py
 create mode 100644 programming_examples/segment_alloc/run_makefile.lit
 create mode 100644 programming_examples/segment_alloc/segment_alloc.py

diff --git a/programming_examples/channel_examples/worker_to_self/worker_to_self.py b/programming_examples/channel_examples/worker_to_self/worker_to_self.py
index 8060fe363..ee9f68293 100644
--- a/programming_examples/channel_examples/worker_to_self/worker_to_self.py
+++ b/programming_examples/channel_examples/worker_to_self/worker_to_self.py
@@ -23,14 +23,18 @@ def build_module():
     ChannelOp("ChanOut")
     ChannelOp("ToSelf")
 
-    # We want to store our data in L1 memory
-    mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
+    mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1)
+    image_type_l1 = MemRefType.get(
+        shape=IMAGE_SIZE,
+        element_type=T.i32(),
+        memory_space=mem_space_l1,
+    )
 
-    # This is the type definition of the image
-    image_type = MemRefType.get(
+    mem_space_l2 = IntegerAttr.get(T.i32(), MemorySpace.L2)
+    image_type_l2 = MemRefType.get(
         shape=IMAGE_SIZE,
         element_type=T.i32(),
-        memory_space=mem_space,
+        memory_space=mem_space_l2,
     )
 
     # We will send an image worth of data in and out
@@ -47,51 +51,43 @@ def launch_body(a, b):
             @segment(name="seg")
             def segment_body():
 
+                tensor_in_l2 = AllocOp(image_type_l2, [], [])
+                ChannelGet("ChanIn", tensor_in_l2)
+
                 # The herd sizes correspond to the dimensions of the contiguous block of cores we are hoping to get.
                 # We just need one compute core, so we ask for a 1x1 herd
-                @herd(name="copyherd", sizes=[1, 1])
-                def herd_body(tx, ty, sx, sy):
+                @herd(
+                    name="copyherd",
+                    sizes=[1, 1],
+                    operands=[tensor_in_l2],
+                )
+                def herd_body(tx, ty, sx, sy, tensor_in_l2):
 
                     # We must allocate a buffer of image size for the input/output
-                    tensor_in = AllocOp(image_type, [], [])
-                    tensor_out = AllocOp(image_type, [], [])
-                    tensor_in2 = AllocOp(image_type, [], [])
-                    tensor_out2 = AllocOp(image_type, [], [])
+                    tensor_in_l1 = AllocOp(image_type_l1, [], [])
+                    tensor_out_l1 = AllocOp(image_type_l1, [], [])
 
-                    ChannelGet("ChanIn", tensor_in)
+                    ChannelPut("ToSelf", tensor_in_l2)
+                    ChannelGet("ToSelf", tensor_in_l1)
 
                     # Access every value in the tile
                     for j in range_(IMAGE_HEIGHT):
                         for i in range_(IMAGE_WIDTH):
                             # Load the input value from tile_in
-                            val = load(tensor_in, [i, j])
+                            val = load(tensor_in_l1, [i, j])
 
                             # Store the output value in tile_out
-                            store(val, tensor_out, [i, j])
+                            store(val, tensor_out_l1, [i, j])
                             yield_([])
                         yield_([])
 
-                    ChannelPut("ToSelf", tensor_out)
-                    ChannelGet("ToSelf", tensor_in2)
-
-                    # Access every value in the tile
-                    for j in range_(IMAGE_HEIGHT):
-                        for i in range_(IMAGE_WIDTH):
-                            # Load the input value from tile_in
-                            val = load(tensor_in2, [i, j])
-
-                            # Store the output value in tile_out
-                            store(val, tensor_out2, [i, j])
-                            yield_([])
-                        yield_([])
-
-                    ChannelPut("ChanOut", tensor_out2)
+                    ChannelPut("ChanOut", tensor_out_l1)
 
                     # Deallocate our L1 buffers
-                    DeallocOp(tensor_in)
-                    DeallocOp(tensor_out)
-                    DeallocOp(tensor_in2)
-                    DeallocOp(tensor_out2)
+                    DeallocOp(tensor_in_l1)
+                    DeallocOp(tensor_out_l1)
+
+                DeallocOp(tensor_in_l2)
 
 
 if __name__ == "__main__":
diff --git a/programming_examples/segment_alloc/Makefile b/programming_examples/segment_alloc/Makefile
new file mode 100644
index 000000000..e25a18738
--- /dev/null
+++ b/programming_examples/segment_alloc/Makefile
@@ -0,0 +1,12 @@
+# (c) Copyright 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
+targetname := $(shell basename ${srcdir})
+# run/clean are commands, not files: always execute them even if such a path exists.
+.PHONY: run clean
+run:
+	mkdir -p build
+	cd build && ${powershell} python3 ${srcdir}/run.py
+
+clean:
+	rm -rf build __pycache__
\ No newline at end of file
diff --git a/programming_examples/segment_alloc/run.py b/programming_examples/segment_alloc/run.py
new file mode 100644
index 000000000..332ae67b6
--- /dev/null
+++ b/programming_examples/segment_alloc/run.py
@@ -0,0 +1,88 @@
+# run.py -*- Python -*-
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: MIT
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+
+import argparse
+import numpy as np
+import air.backend.xrt as xrt_backend
+import filelock
+
+from segment_alloc import *
+
+INOUT_DATATYPE = np.uint32  # element type of the host-side input/output arrays
+INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize  # bytes per element
+INOUT_SIZE = IMAGE_SIZE[0] * IMAGE_SIZE[1]  # number of elements in one image
+INOUT_SIZE_BYTES = INOUT_SIZE * INOUT_ELEM_SIZE  # size of one image in bytes
+
+
+def main(verbose=False, experimental_passes=False):
+    """Build the segment_alloc module, run it through the XRT backend, and check it.
+
+    Only the top-left TILE_HEIGHT x TILE_WIDTH tile of the row-major image is
+    expected to be copied; every other element must keep its sentinel value.
+    Prints "PASS!" and exits 0 on success, exits -1 otherwise.
+    """
+    mlir_module = build_module()
+
+    # Input is 0x1000 + index so each element is unique; output starts as a sentinel.
+    input_a = np.arange(0x1000, 0x1000 + INOUT_SIZE, dtype=INOUT_DATATYPE)
+    output_b = np.full(INOUT_SIZE, 0x00DEFACED, dtype=INOUT_DATATYPE)
+
+    backend = xrt_backend.XRTBackend(
+        verbose=verbose,
+        experimental_passes=experimental_passes,
+        omit_while_true_loop=True,
+    )
+
+    # run the module
+    with filelock.FileLock("/tmp/npu.lock"):
+        mul = backend.compile_and_load(mlir_module)
+        (_, output_b) = mul(input_a, output_b)
+
+    backend.unload()
+
+    # check output, should have the top left filled in
+    errors = 0
+    for i in range(INOUT_SIZE):
+        rb = output_b[i]
+
+        # Integer division: a true division would yield a fractional row index.
+        row = i // IMAGE_WIDTH
+        col = i % IMAGE_WIDTH
+
+        if row < TILE_HEIGHT and col < TILE_WIDTH:
+            # value should have been updated
+            expected = 0x1000 + i
+            if rb != expected:
+                print(f"IM {i} [{col}, {row}] should be 0x{expected:x}, is 0x{rb:x}\n")
+                errors += 1
+        else:
+            # value should stay unchanged
+            if rb != 0x00DEFACED:
+                print(f"IM {i} [{col}, {row}] should be 0xdefaced, is 0x{rb:x}\n")
+                errors += 1
+
+    if errors == 0:
+        print("PASS!")
+        exit(0)
+    else:
+        print("failed. errors=", errors)
+        exit(-1)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        prog="run.py",
+        description="Builds, runs, and tests the segment_alloc example",
+    )
+    # -v/--verbose is forwarded to main(), which hands it to XRTBackend.
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+    )
+    args = parser.parse_args()
+    main(experimental_passes=True, verbose=args.verbose)  # experimental passes always on here
diff --git a/programming_examples/segment_alloc/run_makefile.lit b/programming_examples/segment_alloc/run_makefile.lit
new file mode 100644
index 000000000..fe881ef0f
--- /dev/null
+++ b/programming_examples/segment_alloc/run_makefile.lit
@@ -0,0 +1,8 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+ // SPDX-License-Identifier: MIT
+ //
+ // REQUIRES: ryzen_ai
+ //
+ // RUN: make -f %S/Makefile clean
+ // RUN: make -f %S/Makefile run | FileCheck %s
+ // CHECK: PASS!
diff --git a/programming_examples/segment_alloc/segment_alloc.py b/programming_examples/segment_alloc/segment_alloc.py
new file mode 100644
index 000000000..ad2c28a0f
--- /dev/null
+++ b/programming_examples/segment_alloc/segment_alloc.py
@@ -0,0 +1,109 @@
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+
+from air.ir import *
+from air.dialects.air import *
+from air.dialects.memref import AllocOp, DeallocOp, load, store
+from air.dialects.func import FuncOp
+from air.dialects.scf import for_, yield_
+
+range_ = for_  # alias of scf.for_, used below as `for i in range_(N)`
+
+IMAGE_WIDTH = 32
+IMAGE_HEIGHT = 16
+IMAGE_SIZE = [IMAGE_WIDTH, IMAGE_HEIGHT]  # memref shape of the input/output image
+# NOTE(review): shapes are [W, H]; the DMA sizes in build_module use [H, W] - confirm.
+TILE_WIDTH = 16
+TILE_HEIGHT = 8
+TILE_SIZE = [TILE_WIDTH, TILE_HEIGHT]  # memref shape of the L1 and L2 tile buffers
+
+
+@module_builder
+def build_module():
+    """Copy one image tile through an L2 buffer that is allocated in the segment."""
+    memrefTyInOut = MemRefType.get(IMAGE_SIZE, T.i32())
+
+    # We will send an image worth of data in and out
+    @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
+    def copy(arg0, arg1):
+        # The arguments are the input and output
+        @launch(operands=[arg0, arg1])
+        def launch_body(a, b):
+            # The arguments are still the input and the output
+            @segment(name="seg", operands=[a, b])
+            def segment_body(arg2, arg3):
+                # The buffer allocated at segment level is placed in L2 memory
+                mem_space_l2 = IntegerAttr.get(T.i32(), MemorySpace.L2)
+                # This is the type definition of the L2 tile
+                tile_type_l2 = MemRefType.get(
+                    shape=TILE_SIZE,
+                    element_type=T.i32(),
+                    memory_space=mem_space_l2,
+                )
+
+                # We must allocate a buffer of tile size to stage the input in L2
+                tile_in_l2 = AllocOp(tile_type_l2, [], [])
+
+                # The herd sizes correspond to the dimensions of the contiguous block of cores we are hoping to get.
+                # We just need one compute core, so we ask for a 1x1 herd
+                @herd(name="copyherd", sizes=[1, 1], operands=[arg2, arg3, tile_in_l2])
+                def herd_body(tx, ty, sx, sy, a, b, my_l2_tile):
+                    # We want to store our data in L1 memory
+                    mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1)
+                    # This is the type definition of the L1 tile
+                    tile_type_l1 = MemRefType.get(
+                        shape=TILE_SIZE,
+                        element_type=T.i32(),
+                        memory_space=mem_space_l1,
+                    )
+
+                    # We must allocate a buffer of tile size for the input/output
+                    tile_in_l1 = AllocOp(tile_type_l1, [], [])
+                    tile_out_l1 = AllocOp(tile_type_l1, [], [])
+
+                    # Copy a tile from the input image (a) into the L2 buffer
+                    dma_memcpy_nd(
+                        my_l2_tile,
+                        a,
+                        src_offsets=[0, 0],
+                        src_sizes=[TILE_HEIGHT, TILE_WIDTH],
+                        src_strides=[IMAGE_WIDTH, 1],
+                    )
+
+                    # Copy the tile from the L2 buffer into the L1 memory region (tile_in_l1)
+                    dma_memcpy_nd(
+                        tile_in_l1,
+                        my_l2_tile,
+                    )
+
+                    # Access every value in the tile
+                    for j in range_(TILE_HEIGHT):
+                        for i in range_(TILE_WIDTH):
+                            # Load the input value from tile_in
+                            val = load(tile_in_l1, [i, j])
+
+                            # Store the output value in tile_out
+                            store(val, tile_out_l1, [i, j])
+                            yield_([])
+                        yield_([])
+
+                    # Copy the output tile into the output
+                    dma_memcpy_nd(
+                        b,
+                        tile_out_l1,
+                        dst_offsets=[0, 0],
+                        dst_sizes=[TILE_HEIGHT, TILE_WIDTH],
+                        dst_strides=[IMAGE_WIDTH, 1],
+                    )
+
+                    # Deallocate our L1 buffers
+                    DeallocOp(tile_in_l1)
+                    DeallocOp(tile_out_l1)
+
+                # Deallocate the segment-level L2 buffer after the herd
+                DeallocOp(tile_in_l2)
+
+
+if __name__ == "__main__":
+    module = build_module()  # run directly: build the module once
+    print(module)  # and write its printed form to stdout
diff --git a/python/air/dialects/_air_ops_ext.py b/python/air/dialects/_air_ops_ext.py
index 8a5dd032d..749583ea4 100644
--- a/python/air/dialects/_air_ops_ext.py
+++ b/python/air/dialects/_air_ops_ext.py
@@ -23,6 +23,29 @@ def pyint_to_index(i):
     return arith.ConstantOp.create_index(i) if isinstance(i, int) else i
 
 
+def get_region_operand_types(operands):
+    """
+    Return one type per region-op operand, each a Value or a single-result operation.
+    Raises AttributeError for any other object or an operation without one result.
+    """
+    operand_types = []
+    for o in operands:
+        if isinstance(o, Value):
+            operand_types.append(o.type)
+        elif isinstance(o, OpView):
+            if len(o.results.types) != 1:
+                raise AttributeError(
+                    f"Operation given to a region op as a parameter ({o}) must have exactly "
+                    f"one result type to match one operand, but has {o.results.types}"
+                )
+            operand_types += o.results.types
+        else:
+            raise AttributeError(
+                f"Argument {o} is not a Value or an Operation: {type(o).mro()}"
+            )
+    return operand_types
+
+
 class Launch(LaunchOp):
     """Specialization for LaunchOp class."""
 
@@ -48,7 +71,7 @@ def __init__(
             launch_operands=operands,
             sym_name=name,
         )
-        operand_types = [s.type for s in sizes] * 2 + [o.type for o in operands]
+        operand_types = [s.type for s in sizes] * 2 + get_region_operand_types(operands)
         self.regions[0].blocks.append(*operand_types)
 
 
@@ -74,7 +97,7 @@ def __init__(
             segment_operands=operands,
             sym_name=name,
         )
-        operand_types = [s.type for s in sizes] * 2 + [o.type for o in operands]
+        operand_types = [s.type for s in sizes] * 2 + get_region_operand_types(operands)
         self.regions[0].blocks.append(*operand_types)
 
 
@@ -102,7 +125,7 @@ def __init__(
             sym_name=name,
             link_with=link_with,
         )
-        operand_types = [s.type for s in sizes] * 2 + [o.type for o in operands]
+        operand_types = [s.type for s in sizes] * 2 + get_region_operand_types(operands)
         self.regions[0].blocks.append(*operand_types)