Channel Examples: Worker-to-self, Worker-to-worker #653

Merged: 20 commits, Jul 10, 2024
22 changes: 22 additions & 0 deletions programming_examples/channel_examples/README.md
@@ -22,4 +22,26 @@ cd channel_size
make clean && make
```

#### WIP: ```worker-to-self```:

This example ([worker_to_self/worker_to_self.py](worker_to_self/worker_to_self.py)) is a work-in-progress data passthrough example. It uses the same tiling structure as the [matrix_scalar_add/multi_core_channel](../matrix_scalar_add/multi_core_channel.py) example, except that the sole worker in the herd does some extra shuffling between input and output: it puts the current data tile into a channel and then gets it back from that same channel.

WARNING: This example currently fails because it is assumed that the puts and gets of a channel do not use the same memory region, and this example breaks that assumption.
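
For orientation, the core of the pattern is sketched below, condensed from [worker_to_self/worker_to_self.py](worker_to_self/worker_to_self.py); the surrounding `launch`/`segment` wrappers, the element-wise copy loops, and the deallocations are omitted, so this fragment is an illustration rather than a standalone module:

```python
# Condensed from worker_to_self.py: the herd's single worker puts a tile
# into the "ToSelf" channel and immediately gets it back into a second
# L1 buffer.
ChannelOp("ToSelf")

@herd(name="copyherd", sizes=[1, 1])
def herd_body(tx, ty, sx, sy):
    tensor_out = AllocOp(image_type, [], [])  # tile previously copied in from ChanIn
    tensor_in2 = AllocOp(image_type, [], [])  # buffer the tile is read back into

    # Both endpoints of "ToSelf" are serviced by the same worker, so the put
    # and the get touch the same L1 memory region (the assumption this
    # example breaks).
    ChannelPut("ToSelf", tensor_out)
    ChannelGet("ToSelf", tensor_in2)
```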

```bash
cd worker_to_self
make clean && make
```

#### WIP: ```worker-to-worker```:

This example ([worker_to_worker/worker_to_worker.py](worker_to_worker/worker_to_worker.py)) is a work-in-progress data passthrough example. It uses the same tiling structure as the [matrix_scalar_add/multi_core_channel](../matrix_scalar_add/multi_core_channel.py) example, except that each worker trades a tile of input data with another worker in the herd by sending it over a channel.

WARNING: This example currently fails for unknown reasons.
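
The AIR code for the tile exchange lives in [worker_to_worker/worker_to_worker.py](worker_to_worker/worker_to_worker.py) and is not repeated here. As a rough host-side mental model (plain Python with queues, not the AIR channel API; all names below are made up for the sketch), each worker pushes its tile to a partner's queue and pops whatever arrived in its own, so the output holds the same tiles in exchanged positions:

```python
# Host-side illustration only: models "trading tiles over a channel" with
# Python queues. This is not the AIR implementation used in
# worker_to_worker.py.
import numpy as np
from queue import Queue

TILE = 4
NUM_WORKERS = 2

# Each worker starts with its own tile of data.
tiles = [np.full((TILE, TILE), k, dtype=np.uint32) for k in range(NUM_WORKERS)]

# One queue per worker stands in for that worker's channel endpoint.
channels = [Queue() for _ in range(NUM_WORKERS)]

# Each worker sends its tile to its partner's channel ...
for worker, tile in enumerate(tiles):
    partner = (worker + 1) % NUM_WORKERS
    channels[partner].put(tile)

# ... and receives whatever landed in its own channel.
received = [channels[worker].get() for worker in range(NUM_WORKERS)]

# Every worker now holds its partner's tile: the output is the same set of
# tiles, just in exchanged positions.
assert all((received[w] == tiles[(w - 1) % NUM_WORKERS]).all() for w in range(NUM_WORKERS))
```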

```bash
cd worker_to_worker
make clean && make
```

#### WIP: more examples!
@@ -21,10 +21,6 @@
assert IMAGE_HEIGHT % TILE_HEIGHT == 0


def format_name(prefix, index_0, index_1):
    return f"{prefix}{index_0:02}{index_1:02}"


@module_builder
def build_module():
    memrefTyInOut = MemRefType.get(IMAGE_SIZE, T.i32())
4 changes: 3 additions & 1 deletion programming_examples/channel_examples/channel_size/run.py
@@ -33,7 +33,9 @@ def test_main(build_module, verbose=False):
        input_a[i] = i + 0x1000
        input_b[i] = 0x00DEFACED

    backend = xrt_backend.XRTBackend(verbose=verbose, omit_while_true_loop=True)
    backend = xrt_backend.XRTBackend(
        verbose=verbose, experimental_passes=True, omit_while_true_loop=True
    )

    if verbose:
        print_matrix(input_b)
12 changes: 12 additions & 0 deletions programming_examples/channel_examples/worker_to_self/Makefile
@@ -0,0 +1,12 @@
# (c) Copyright 2024 Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))

targetname := $(shell basename ${srcdir})

run:
	mkdir -p build
	cd build && ${powershell} python3 ${srcdir}/run.py -v

clean:
	rm -rf build __pycache__
88 changes: 88 additions & 0 deletions programming_examples/channel_examples/worker_to_self/run.py
@@ -0,0 +1,88 @@
# run.py -*- Python -*-
#
# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
# SPDX-License-Identifier: MIT
import argparse
import numpy as np
import air.backend.xrt as xrt_backend
import filelock

from worker_to_self import *

INOUT_DATATYPE = np.uint32
INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
INOUT_SIZE = IMAGE_SIZE[0] * IMAGE_SIZE[1]
INOUT_SIZE_BYTES = INOUT_SIZE * INOUT_ELEM_SIZE


def print_matrix(matrix_array):
    for i in range(IMAGE_HEIGHT):
        row = matrix_array[i * IMAGE_WIDTH : (i + 1) * IMAGE_WIDTH]
        for val in row:
            val = val & 0xFFFF
            print(f"{val:04x}", end=" ")
        print("")


def test_main(build_module, verbose=False):
    mlir_module = build_module()

    input_a = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
    input_b = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
    for i in range(INOUT_SIZE):
        input_a[i] = i + 0x1000
        input_b[i] = 0x00DEFACED

    backend = xrt_backend.XRTBackend(verbose=verbose, omit_while_true_loop=True)

    if verbose:
        print_matrix(input_b)

    # run the module
    with filelock.FileLock("/tmp/npu.lock"):
        addone = backend.compile_and_load(mlir_module)
        (_, output_b) = addone(input_a, input_b)

        backend.unload()

    if verbose:
        print_matrix(output_b)

    # check output, should match the input data (this is a passthrough)
    errors = 0
    for i in range(INOUT_SIZE):
        rb = output_b[i]
        expected_value = input_a[i]

        # value should have been updated
        if not (rb == expected_value):
            """
            row = i // IMAGE_WIDTH
            col = i % IMAGE_WIDTH
            print(
                f"IM {i} [{col}, {row}] should be 0x{expected_value:x}, is 0x{rb:x}\n"
            )
            """
            errors += 1

    if errors == 0:
        print("PASS!")
        exit(0)
    else:
        print("failed. errors=", errors)
        exit(-1)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="run.py",
        description="Builds, runs, and tests the channel_examples/worker_to_self example",
    )

    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
    )
    args = parser.parse_args()
    test_main(build_module, verbose=args.verbose)
@@ -0,0 +1,9 @@
// (c) Copyright 2024 Advanced Micro Devices, Inc.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// REQUIRES: ryzen_ai
//
// RUN: make -f %S/Makefile clean
// RUN: make -f %S/Makefile run | FileCheck %s
// CHECK: PASS!
// XFAIL: *
@@ -0,0 +1,99 @@
# Copyright (C) 2024, Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT

from air.ir import *
from air.dialects.air import *
from air.dialects.memref import AllocOp, DeallocOp, load, store
from air.dialects.func import FuncOp
from air.dialects.scf import for_, yield_

range_ = for_

IMAGE_WIDTH = 32
IMAGE_HEIGHT = 16
IMAGE_SIZE = [IMAGE_WIDTH, IMAGE_HEIGHT]


@module_builder
def build_module():

    # Type and method of input/output
    memrefTyInOut = T.MemRefType.get(IMAGE_SIZE, T.i32())
    ChannelOp("ChanIn")
    ChannelOp("ChanOut")
    ChannelOp("ToSelf")

    # We want to store our data in L1 memory
    mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)

    # This is the type definition of the image
    image_type = MemRefType.get(
        shape=IMAGE_SIZE,
        element_type=T.i32(),
        memory_space=mem_space,
    )

    # We will send an image worth of data in and out
    @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
    def copy(arg0, arg1):

        # The arguments are the input and output
        @launch(operands=[arg0, arg1])
        def launch_body(a, b):
            ChannelPut("ChanIn", a)
            ChannelGet("ChanOut", b)

            # The arguments are still the input and the output
            @segment(name="seg")
            def segment_body():

                # The herd sizes correspond to the dimensions of the contiguous block of cores we are hoping to get.
                # We just need one compute core, so we ask for a 1x1 herd
                @herd(name="copyherd", sizes=[1, 1])
                def herd_body(tx, ty, sx, sy):

                    # We must allocate a buffer of image size for the input/output
                    tensor_in = AllocOp(image_type, [], [])
                    tensor_out = AllocOp(image_type, [], [])
                    tensor_in2 = AllocOp(image_type, [], [])
                    tensor_out2 = AllocOp(image_type, [], [])

                    ChannelGet("ChanIn", tensor_in)

                    # Access every value in the tile
                    for j in range_(IMAGE_HEIGHT):
                        for i in range_(IMAGE_WIDTH):
                            # Load the input value from tile_in
                            val = load(tensor_in, [i, j])

                            # Store the output value in tile_out
                            store(val, tensor_out, [i, j])
                            yield_([])
                        yield_([])

                    ChannelPut("ToSelf", tensor_out)
                    ChannelGet("ToSelf", tensor_in2)

                    # Access every value in the tile
                    for j in range_(IMAGE_HEIGHT):
                        for i in range_(IMAGE_WIDTH):
                            # Load the input value from tile_in
                            val = load(tensor_in2, [i, j])

                            # Store the output value in tile_out
                            store(val, tensor_out2, [i, j])
                            yield_([])
                        yield_([])

                    ChannelPut("ChanOut", tensor_out2)

                    # Deallocate our L1 buffers
                    DeallocOp(tensor_in)
                    DeallocOp(tensor_out)
                    DeallocOp(tensor_in2)
                    DeallocOp(tensor_out2)


if __name__ == "__main__":
    module = build_module()
    print(module)
12 changes: 12 additions & 0 deletions programming_examples/channel_examples/worker_to_worker/Makefile
@@ -0,0 +1,12 @@
# (c) Copyright 2024 Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))

targetname := $(shell basename ${srcdir})

run:
	mkdir -p build
	cd build && ${powershell} python3 ${srcdir}/run.py -v

clean:
	rm -rf build __pycache__
88 changes: 88 additions & 0 deletions programming_examples/channel_examples/worker_to_worker/run.py
@@ -0,0 +1,88 @@
# run.py -*- Python -*-
#
# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
# SPDX-License-Identifier: MIT
import argparse
import numpy as np
import air.backend.xrt as xrt_backend
import filelock

from worker_to_worker import *

INOUT_DATATYPE = np.uint32
INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
INOUT_SIZE = IMAGE_SIZE[0] * IMAGE_SIZE[1]
INOUT_SIZE_BYTES = INOUT_SIZE * INOUT_ELEM_SIZE


def print_matrix(matrix_array):
    for i in range(IMAGE_HEIGHT):
        row = matrix_array[i * IMAGE_WIDTH : (i + 1) * IMAGE_WIDTH]
        for val in row:
            val = val & 0xFFFF
            print(f"{val:04x}", end=" ")
        print("")


def test_main(build_module, verbose=False):
    mlir_module = build_module()

    input_a = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
    input_b = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
    for i in range(INOUT_SIZE):
        input_a[i] = i + 0x1000
        input_b[i] = 0x00DEFACED

    backend = xrt_backend.XRTBackend(verbose=verbose, omit_while_true_loop=True)

    if verbose:
        print_matrix(input_b)

    # run the module
    with filelock.FileLock("/tmp/npu.lock"):
        addone = backend.compile_and_load(mlir_module)
        (_, output_b) = addone(input_a, input_b)

        backend.unload()

    if verbose:
        print_matrix(output_b)

    # check output, should match the input data (this is a passthrough)
    errors = 0
    for i in range(INOUT_SIZE):
        rb = output_b[i]
        expected_value = input_a[i]

        # value should have been updated
        if not (rb == expected_value):
            """
            row = i // IMAGE_WIDTH
            col = i % IMAGE_WIDTH
            print(
                f"IM {i} [{col}, {row}] should be 0x{expected_value:x}, is 0x{rb:x}\n"
            )
            """
            errors += 1

    if errors == 0:
        print("PASS!")
        exit(0)
    else:
        print("failed. errors=", errors)
        exit(-1)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="run.py",
        description="Builds, runs, and tests the channel_examples/worker_to_worker example",
    )

    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
    )
    args = parser.parse_args()
    test_main(build_module, verbose=args.verbose)
@@ -0,0 +1,9 @@
// (c) Copyright 2024 Advanced Micro Devices, Inc.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// REQUIRES: ryzen_ai
//
// RUN: make -f %S/Makefile clean
// RUN: make -f %S/Makefile run | FileCheck %s
// CHECK: PASS!
// XFAIL: *