Programming Example using ChannelOp size Parameter (#642)

* Add channel size example * Update documentation * modify expected output for verification * Fix a few small issues * Make channel size standalone * matrix scalar add example is working * channel size example is working * Realized that there is not really common code in the channel examples so made herd_to_herd standalone too. Also updated docs * Clarify example documentation
Xilinx · Jul 8, 2024 · 6f6f639 · 6f6f639
1 parent ae11b76
commit 6f6f639
Show file tree

Hide file tree

Showing 7 changed files with 250 additions and 40 deletions.
diff --git a/programming_examples/channel_examples/README.md b/programming_examples/channel_examples/README.md
@@ -13,4 +13,13 @@ cd herd_to_herd
 make clean && make
 ```
 
+#### ```channel-size```: Use the channel size argument
+
+This example ([channel_size/channel_size.py](channel_size/channel_size.py)) is a data passthrough example that uses the same tiling structure as the [matrix_scalar_add/multi_core_channel](../matrix_scalar_add/multi_core_channel.py) example. Instead of defining a separate channel for each tile/core, it creates a bundle of channels (using the `ChannelOp` `size` parameter) and indexes into that bundle (using the `indices` parameter of `ChannelGet` and `ChannelPut`).
+
+```bash
+cd channel_size
+make clean && make
+```
+
 #### WIP: more examples!
diff --git a/programming_examples/channel_examples/channel_size/Makefile b/programming_examples/channel_examples/channel_size/Makefile
@@ -0,0 +1,12 @@
+# Copyright (C) 2022, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
+# Name of the directory that contains this Makefile
+targetname := $(shell basename ${srcdir})
+# Run run.py from inside build/; ${powershell} is not set here (presumably supplied by the environment)
+run:
+	mkdir -p build
+	cd build && ${powershell} python3 ${srcdir}/run.py
+# Remove the build directory and the Python bytecode cache
+clean:
+	rm -rf build __pycache__
diff --git a/programming_examples/channel_examples/channel_size/channel_size.py b/programming_examples/channel_examples/channel_size/channel_size.py
@@ -0,0 +1,124 @@
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+
+from air.ir import *
+from air.dialects.air import *
+from air.dialects.memref import AllocOp, DeallocOp, load, store
+from air.dialects.func import FuncOp
+from air.dialects.scf import for_, yield_
+
+range_ = for_  # alias for the scf dialect's for_ loop builder imported above
+
+IMAGE_WIDTH = 32
+IMAGE_HEIGHT = 16
+IMAGE_SIZE = [IMAGE_WIDTH, IMAGE_HEIGHT]  # shape of the input/output memref type
+
+TILE_WIDTH = 16
+TILE_HEIGHT = 8
+TILE_SIZE = [TILE_WIDTH, TILE_HEIGHT]  # shape of each worker's tile buffer
+
+assert IMAGE_WIDTH % TILE_WIDTH == 0  # the image must divide into whole tiles
+assert IMAGE_HEIGHT % TILE_HEIGHT == 0
+
+
+def format_name(prefix, index_0, index_1):  # prefix + both indices zero-padded to 2
+    return "{}{:02}{:02}".format(prefix, index_0, index_1)
+
+
+@module_builder
+def build_module():
+    """Build the passthrough module: each tile moves through its own channel."""
+    memrefTyInOut = MemRefType.get(IMAGE_SIZE, T.i32())
+
+    # Create an input/output channel pair per worker
+    ChannelOp("ChanIn", size=[IMAGE_WIDTH // TILE_WIDTH, IMAGE_HEIGHT // TILE_HEIGHT])
+    ChannelOp("ChanOut", size=[IMAGE_WIDTH // TILE_WIDTH, IMAGE_HEIGHT // TILE_HEIGHT])
+
+    # We will send an image worth of data in and out
+    @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
+    def copy(arg0, arg1):
+        # The arguments are the input and output
+        @launch(operands=[arg0, arg1])
+        def launch_body(a, b):
+
+            # Send one tile per worker; the offsets are the tile's first row and column
+            for h in range(IMAGE_HEIGHT // TILE_HEIGHT):
+                for w in range(IMAGE_WIDTH // TILE_WIDTH):
+                    offset0 = TILE_HEIGHT * h
+                    offset1 = TILE_WIDTH * w
+
+                    # Put data into the channel tile by tile
+                    ChannelPut(
+                        "ChanIn",
+                        a,
+                        indices=[w, h],
+                        offsets=[offset0, offset1],
+                        sizes=[TILE_HEIGHT, TILE_WIDTH],
+                        strides=[IMAGE_WIDTH, 1],
+                    )
+
+            # Receive one tile per worker at the same offsets it was read from
+            for h in range(IMAGE_HEIGHT // TILE_HEIGHT):
+                for w in range(IMAGE_WIDTH // TILE_WIDTH):
+                    offset0 = TILE_HEIGHT * h
+                    offset1 = TILE_WIDTH * w
+
+                    # Write data back out to the channel tile by tile
+                    ChannelGet(
+                        "ChanOut",
+                        b,
+                        indices=[w, h],
+                        offsets=[offset0, offset1],
+                        sizes=[TILE_HEIGHT, TILE_WIDTH],
+                        strides=[IMAGE_WIDTH, 1],
+                    )
+
+            # The segment takes no operands; all data moves through the channels
+            @segment(name="seg")
+            def segment_body():
+                @herd(
+                    name="xaddherd",
+                    sizes=[IMAGE_WIDTH // TILE_WIDTH, IMAGE_HEIGHT // TILE_HEIGHT],
+                )
+                def herd_body(tw, th, _sx, _sy):
+                    # Index names follow the herd sizes order: width first, then height
+
+                    # We want to store our data in L1 memory
+                    mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
+
+                    # This is the type definition of the tile
+                    tile_type = MemRefType.get(
+                        shape=TILE_SIZE,
+                        element_type=T.i32(),
+                        memory_space=mem_space,
+                    )
+
+                    # We must allocate a buffer of tile size for the input/output
+                    tile_in = AllocOp(tile_type, [], [])
+                    tile_out = AllocOp(tile_type, [], [])
+
+                    # Copy a tile from the input image (a) into the L1 memory region (tile_in)
+                    ChannelGet("ChanIn", tile_in, indices=[tw, th])
+
+                    # Access every value in the tile
+                    for j in range_(TILE_HEIGHT):
+                        for i in range_(TILE_WIDTH):
+                            # Load the input value from tile_in
+                            val = load(tile_in, [i, j])
+
+                            # Store the output value in tile_out
+                            store(val, tile_out, [i, j])
+                            yield_([])
+                        yield_([])
+
+                    # Copy the output tile into the output
+                    ChannelPut("ChanOut", tile_out, indices=[tw, th])
+
+                    # Deallocate our L1 buffers
+                    DeallocOp(tile_in)
+                    DeallocOp(tile_out)
+
+
+if __name__ == "__main__":
+    # Running this file directly prints the generated module
+    print(build_module())
diff --git a/...mming_examples/channel_examples/common.py → ...ples/channel_examples/channel_size/run.py b/...mming_examples/channel_examples/common.py → ...ples/channel_examples/channel_size/run.py
@@ -1,13 +1,13 @@
-# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# run.py -*- Python -*-
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
-
+import argparse
 import numpy as np
 import air.backend.xrt as xrt_backend
 import filelock
 
-IMAGE_WIDTH = 32
-IMAGE_HEIGHT = 16
-IMAGE_SIZE = [IMAGE_WIDTH, IMAGE_HEIGHT]
+from channel_size import *
 
 INOUT_DATATYPE = np.uint32
 INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
@@ -30,10 +30,9 @@ def test_main(build_module, verbose=False):
     input_a = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
     input_b = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
     for i in range(INOUT_SIZE):
-        input_a[i] = 0x2
-        input_b[i] = 0x00C0FFEE
+        input_a[i] = i + 0x1000
+        input_b[i] = 0x00DEFACED
 
-    # TODO(hunhoffe): need to figure out why single-core-dma fails with experimental_passes=True
     backend = xrt_backend.XRTBackend(verbose=verbose, omit_while_true_loop=True)
 
     if verbose:
@@ -53,12 +52,12 @@ def test_main(build_module, verbose=False):
     errors = 0
     for i in range(INOUT_SIZE):
         rb = output_b[i]
+        expected_value = input_a[i]
 
         row = i // IMAGE_WIDTH
         col = i % IMAGE_WIDTH
 
         # value should have been updated
-        expected_value = 0x2 * 0x2 + 1
         if not (rb == expected_value):
             """
             print(
@@ -73,3 +72,18 @@ def test_main(build_module, verbose=False):
     else:
         print("failed. errors=", errors)
         exit(-1)
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse the flags, then build, run and check the example
+    parser = argparse.ArgumentParser(
+        prog="run.py",
+        description="Builds, runs, and tests the channel_examples/channel_size example",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+    )
+    args = parser.parse_args()
+    test_main(build_module, verbose=args.verbose)
diff --git a/programming_examples/channel_examples/channel_size/run_makefile.lit b/programming_examples/channel_examples/channel_size/run_makefile.lit
@@ -0,0 +1,8 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+ // SPDX-License-Identifier: MIT
+ //
+ // REQUIRES: ryzen_ai
+ //
+ // RUN: make -f %S/Makefile clean
+ // RUN: make -f %S/Makefile run | FileCheck %s
+ // CHECK: PASS!
diff --git a/programming_examples/channel_examples/herd_to_herd/herd_to_herd.py b/programming_examples/channel_examples/herd_to_herd/herd_to_herd.py
@@ -1,18 +1,5 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import sys
-from pathlib import Path  # if you haven't already done so
-
-# Python paths are a bit complex. Taking solution from : https://stackoverflow.com/questions/16981921/relative-imports-in-python-3
-file = Path(__file__).resolve()
-parent, root = file.parent, file.parents[1]
-sys.path.append(str(root))
-
-# Additionally remove the current file's directory from sys.path
-try:
-    sys.path.remove(str(parent))
-except ValueError:  # Already removed
-    pass
 
 from air.ir import *
 from air.dialects.air import *
@@ -24,8 +11,9 @@
 
 range_ = for_
 
-
-from common import *
+IMAGE_WIDTH = 32
+IMAGE_HEIGHT = 16
+IMAGE_SIZE = [IMAGE_WIDTH, IMAGE_HEIGHT]
 
 
 @module_builder

diff --git a/programming_examples/channel_examples/herd_to_herd/run.py b/programming_examples/channel_examples/herd_to_herd/run.py
@@ -3,22 +3,77 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 import argparse
-import sys
-from pathlib import Path  # if you haven't already done so
-
-# Python paths are a bit complex. Taking solution from : https://stackoverflow.com/questions/16981921/relative-imports-in-python-3
-file = Path(__file__).resolve()
-parent, root = file.parent, file.parents[1]
-sys.path.append(str(root))
-
-# Additionally remove the current file's directory from sys.path
-try:
-    sys.path.remove(str(parent))
-except ValueError:  # Already removed
-    pass
-
-from herd_to_herd.herd_to_herd import build_module
-from common import test_main
+import numpy as np
+import air.backend.xrt as xrt_backend
+import filelock
+
+from herd_to_herd import *
+
+INOUT_DATATYPE = np.uint32  # element type of the host input/output arrays
+INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize  # bytes per element
+INOUT_SIZE = IMAGE_SIZE[0] * IMAGE_SIZE[1]  # number of elements in one image
+INOUT_SIZE_BYTES = INOUT_SIZE * INOUT_ELEM_SIZE  # bytes in one image
+
+
+def print_matrix(matrix_array):
+    """Print the low 16 bits of each element as 4-digit hex, one image row per line."""
+    for row_index in range(IMAGE_HEIGHT):
+        start = row_index * IMAGE_WIDTH
+        for element in matrix_array[start : start + IMAGE_WIDTH]:
+            print(f"{element & 0xFFFF:04x}", end=" ")
+        print("")
+
+
+def test_main(build_module, verbose=False):
+    """Build the module, run it through the XRT backend, and check the output.
+
+    build_module is called with no arguments to produce the module. verbose is
+    forwarded to the backend and also prints the matrices and each mismatch.
+    Prints "PASS!" and exits with 0 when every output element equals
+    0x2 * 0x2 + 1; otherwise prints the error count and exits with -1.
+    """
+    mlir_module = build_module()
+
+    # Constant fills: every input element is 0x2, the second buffer a marker value
+    input_a = np.full(INOUT_SIZE, 0x2, dtype=INOUT_DATATYPE)
+    input_b = np.full(INOUT_SIZE, 0x00C0FFEE, dtype=INOUT_DATATYPE)
+    expected_value = 0x2 * 0x2 + 1
+
+    backend = xrt_backend.XRTBackend(verbose=verbose, omit_while_true_loop=True)
+
+    if verbose:
+        print_matrix(input_b)
+
+    # Compile, load and run the module while holding /tmp/npu.lock
+    with filelock.FileLock("/tmp/npu.lock"):
+        run_module = backend.compile_and_load(mlir_module)
+        (_, output_b) = run_module(input_a, input_b)
+
+    backend.unload()
+
+    if verbose:
+        print_matrix(output_b)
+
+    # Every output element must equal the expected constant
+    errors = 0
+    for i in range(INOUT_SIZE):
+        rb = output_b[i]
+        if rb != expected_value:
+            errors += 1
+            if verbose:
+                # Report the mismatch with its column/row position in the image
+                row, col = divmod(i, IMAGE_WIDTH)
+                print(
+                    f"IM {i} [{col}, {row}] should be 0x{expected_value:x}, is 0x{rb:x}\n"
+                )
+
+    if errors == 0:
+        print("PASS!")
+        exit(0)
+    else:
+        print("failed. errors=", errors)
+        exit(-1)
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(