diff --git a/programming_examples/channel_examples/README.md b/programming_examples/channel_examples/README.md
index e68915329..7ccfc3baa 100644
--- a/programming_examples/channel_examples/README.md
+++ b/programming_examples/channel_examples/README.md
@@ -22,4 +22,26 @@ cd channel_size
 make clean && make
 ```
 
+#### WIP: ```worker-to-self```:
+
+This example ([worker_to_self/worker_to_self.py](worker_to_self/worker_to_self.py)) is a work-in-progress data passthrough example using the same tiling structure as the [matrix_scalar_add/multi_core_channel](../matrix_scalar_add/multi_core_channel.py) examples, except that the sole worker in the herd does some extra shuffling between input and output: it puts the current data tile into a channel and then gets it back from the same channel.
+
+WARNING: This example currently fails because it is assumed that channel puts/gets do not use the same memory region, and this example breaks that assumption.
+
+```bash
+cd worker_to_self
+make clean && make
+```
+
+#### WIP: ```worker-to-worker```:
+
+This example ([worker_to_worker/worker_to_worker.py](worker_to_worker/worker_to_worker.py)) is a work-in-progress data passthrough example using the same tiling structure as the [matrix_scalar_add/multi_core_channel](../matrix_scalar_add/multi_core_channel.py) examples, except that each worker trades a tile of input data with another worker in the herd by sending it through a channel.
+
+WARNING: This example currently fails for unknown reasons.
+
+```bash
+cd worker_to_worker
+make clean && make
+```
+
 #### WIP: more examples!
\ No newline at end of file
diff --git a/programming_examples/channel_examples/channel_size/channel_size.py b/programming_examples/channel_examples/channel_size/channel_size.py
index ef3d5a4b8..3f64db5e1 100644
--- a/programming_examples/channel_examples/channel_size/channel_size.py
+++ b/programming_examples/channel_examples/channel_size/channel_size.py
@@ -21,10 +21,6 @@
 assert IMAGE_HEIGHT % TILE_HEIGHT == 0
 
 
-def format_name(prefix, index_0, index_1):
-    return f"{prefix}{index_0:02}{index_1:02}"
-
-
 @module_builder
 def build_module():
     memrefTyInOut = MemRefType.get(IMAGE_SIZE, T.i32())
diff --git a/programming_examples/channel_examples/channel_size/run.py b/programming_examples/channel_examples/channel_size/run.py
index 39d8d40c0..0a3bb2dd3 100644
--- a/programming_examples/channel_examples/channel_size/run.py
+++ b/programming_examples/channel_examples/channel_size/run.py
@@ -33,7 +33,9 @@ def test_main(build_module, verbose=False):
         input_a[i] = i + 0x1000
         input_b[i] = 0x00DEFACED
 
-    backend = xrt_backend.XRTBackend(verbose=verbose, omit_while_true_loop=True)
+    backend = xrt_backend.XRTBackend(
+        verbose=verbose, experimental_passes=True, omit_while_true_loop=True
+    )
 
     if verbose:
         print_matrix(input_b)
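Note on the change above: every `run.py` in this patch drives the NPU through the same few XRT backend calls. The sketch below strings those calls together for the `channel_size` example, using only the constructor flags and methods that appear in this diff (`XRTBackend`, `compile_and_load`, `unload`); the input values are illustrative, not required.

```python
import numpy as np
import filelock
import air.backend.xrt as xrt_backend

from channel_size import build_module, IMAGE_SIZE  # built by channel_size.py above

mlir_module = build_module()
input_a = np.arange(IMAGE_SIZE[0] * IMAGE_SIZE[1], dtype=np.uint32)
input_b = np.zeros_like(input_a)

backend = xrt_backend.XRTBackend(
    verbose=False, experimental_passes=True, omit_while_true_loop=True
)

# /tmp/npu.lock serializes access to the NPU, as in the run.py scripts.
with filelock.FileLock("/tmp/npu.lock"):
    compiled = backend.compile_and_load(mlir_module)
    (_, output_b) = compiled(input_a, input_b)
backend.unload()
```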
diff --git a/programming_examples/channel_examples/worker_to_self/Makefile b/programming_examples/channel_examples/worker_to_self/Makefile
new file mode 100644
index 000000000..79be368b8
--- /dev/null
+++ b/programming_examples/channel_examples/worker_to_self/Makefile
@@ -0,0 +1,12 @@
+# (c) Copyright 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
+
+targetname := $(shell basename ${srcdir})
+
+run:
+	mkdir -p build
+	cd build && ${powershell} python3 ${srcdir}/run.py -v
+
+clean:
+	rm -rf build __pycache__
diff --git a/programming_examples/channel_examples/worker_to_self/run.py b/programming_examples/channel_examples/worker_to_self/run.py
new file mode 100644
index 000000000..c5b7825d5
--- /dev/null
+++ b/programming_examples/channel_examples/worker_to_self/run.py
@@ -0,0 +1,88 @@
+# run.py -*- Python -*-
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: MIT
+import argparse
+import numpy as np
+import air.backend.xrt as xrt_backend
+import filelock
+
+from worker_to_self import *
+
+INOUT_DATATYPE = np.uint32
+INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
+INOUT_SIZE = IMAGE_SIZE[0] * IMAGE_SIZE[1]
+INOUT_SIZE_BYTES = INOUT_SIZE * INOUT_ELEM_SIZE
+
+
+def print_matrix(matrix_array):
+    for i in range(IMAGE_HEIGHT):
+        row = matrix_array[i * IMAGE_WIDTH : (i + 1) * IMAGE_WIDTH]
+        for val in row:
+            val = val & 0xFFFF
+            print(f"{val:04x}", end=" ")
+        print("")
+
+
+def test_main(build_module, verbose=False):
+    mlir_module = build_module()
+
+    input_a = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
+    input_b = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
+    for i in range(INOUT_SIZE):
+        input_a[i] = i + 0x1000
+        input_b[i] = 0x00DEFACED
+
+    backend = xrt_backend.XRTBackend(verbose=verbose, omit_while_true_loop=True)
+
+    if verbose:
+        print_matrix(input_b)
+
+    # run the module
+    with filelock.FileLock("/tmp/npu.lock"):
+        addone = backend.compile_and_load(mlir_module)
+        (_, output_b) = addone(input_a, input_b)
+
+    backend.unload()
+
+    if verbose:
+        print_matrix(output_b)
+
+    # check output: this is a passthrough, so output should match input_a
+    errors = 0
+    for i in range(INOUT_SIZE):
+        rb = output_b[i]
+        expected_value = input_a[i]
+
+        # value should have passed through unchanged
+        if not (rb == expected_value):
+            """
+            row = i // IMAGE_WIDTH
+            col = i % IMAGE_WIDTH
+            print(
+                f"IM {i} [{col}, {row}] should be 0x{expected_value:x}, is 0x{rb:x}\n"
+            )
+            """
+            errors += 1
+
+    if errors == 0:
+        print("PASS!")
+        exit(0)
+    else:
+        print("failed. errors=", errors)
+        exit(-1)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        prog="run.py",
+        description="Builds, runs, and tests the channel_examples/worker_to_self example",
+    )
+
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+    )
+    args = parser.parse_args()
+    test_main(build_module, verbose=args.verbose)
diff --git a/programming_examples/channel_examples/worker_to_self/run_makefile.lit b/programming_examples/channel_examples/worker_to_self/run_makefile.lit
new file mode 100644
index 000000000..07d105c0c
--- /dev/null
+++ b/programming_examples/channel_examples/worker_to_self/run_makefile.lit
@@ -0,0 +1,9 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai
+//
+// RUN: make -f %S/Makefile clean
+// RUN: make -f %S/Makefile run | FileCheck %s
+// CHECK: PASS!
+// XFAIL: *
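The element-by-element check in `run.py` above can be collapsed into a vectorized NumPy comparison. A minimal sketch of that equivalent check (the helper name is made up for illustration; it is not part of this patch):

```python
import numpy as np

def count_errors(output_b: np.ndarray, input_a: np.ndarray) -> int:
    # Both WIP examples are pure passthroughs, so every element of the output
    # buffer is expected to equal the corresponding element of input_a.
    return int(np.count_nonzero(output_b != input_a))

# Usage, mirroring run.py's pass/fail decision:
# errors = count_errors(output_b, input_a)
# print("PASS!" if errors == 0 else f"failed. errors={errors}")
```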
diff --git a/programming_examples/channel_examples/worker_to_self/worker_to_self.py b/programming_examples/channel_examples/worker_to_self/worker_to_self.py
new file mode 100644
index 000000000..8060fe363
--- /dev/null
+++ b/programming_examples/channel_examples/worker_to_self/worker_to_self.py
@@ -0,0 +1,99 @@
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+
+from air.ir import *
+from air.dialects.air import *
+from air.dialects.memref import AllocOp, DeallocOp, load, store
+from air.dialects.func import FuncOp
+from air.dialects.scf import for_, yield_
+
+range_ = for_
+
+IMAGE_WIDTH = 32
+IMAGE_HEIGHT = 16
+IMAGE_SIZE = [IMAGE_WIDTH, IMAGE_HEIGHT]
+
+
+@module_builder
+def build_module():
+
+    # Type and method of input/output
+    memrefTyInOut = MemRefType.get(IMAGE_SIZE, T.i32())
+    ChannelOp("ChanIn")
+    ChannelOp("ChanOut")
+    ChannelOp("ToSelf")
+
+    # We want to store our data in L1 memory
+    mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
+
+    # This is the type definition of the image
+    image_type = MemRefType.get(
+        shape=IMAGE_SIZE,
+        element_type=T.i32(),
+        memory_space=mem_space,
+    )
+
+    # We will send an image worth of data in and out
+    @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
+    def copy(arg0, arg1):
+
+        # The arguments are the input and output
+        @launch(operands=[arg0, arg1])
+        def launch_body(a, b):
+            ChannelPut("ChanIn", a)
+            ChannelGet("ChanOut", b)
+
+            # The arguments are still the input and the output
+            @segment(name="seg")
+            def segment_body():
+
+                # The herd sizes correspond to the dimensions of the contiguous block of cores we are hoping to get.
+                # We just need one compute core, so we ask for a 1x1 herd
+                @herd(name="copyherd", sizes=[1, 1])
+                def herd_body(tx, ty, sx, sy):
+
+                    # We must allocate a buffer of image size for the input/output
+                    tensor_in = AllocOp(image_type, [], [])
+                    tensor_out = AllocOp(image_type, [], [])
+                    tensor_in2 = AllocOp(image_type, [], [])
+                    tensor_out2 = AllocOp(image_type, [], [])
+
+                    ChannelGet("ChanIn", tensor_in)
+
+                    # Access every value in the image
+                    for j in range_(IMAGE_HEIGHT):
+                        for i in range_(IMAGE_WIDTH):
+                            # Load the input value from tensor_in
+                            val = load(tensor_in, [i, j])
+
+                            # Store the output value in tensor_out
+                            store(val, tensor_out, [i, j])
+                            yield_([])
+                        yield_([])
+
+                    ChannelPut("ToSelf", tensor_out)
+                    ChannelGet("ToSelf", tensor_in2)
+
+                    # Access every value in the image
+                    for j in range_(IMAGE_HEIGHT):
+                        for i in range_(IMAGE_WIDTH):
+                            # Load the input value from tensor_in2
+                            val = load(tensor_in2, [i, j])
+
+                            # Store the output value in tensor_out2
+                            store(val, tensor_out2, [i, j])
+                            yield_([])
+                        yield_([])
+
+                    ChannelPut("ChanOut", tensor_out2)
+
+                    # Deallocate our L1 buffers
+                    DeallocOp(tensor_in)
+                    DeallocOp(tensor_out)
+                    DeallocOp(tensor_in2)
+                    DeallocOp(tensor_out2)
+
+
+if __name__ == "__main__":
+    module = build_module()
+    print(module)
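As a host-side picture of the dataflow `worker_to_self.py` sets up, the sketch below models the put/get hops with `queue.SimpleQueue` objects standing in for the AIR channels (an illustration of intent only; it says nothing about how channels are lowered to the device, which is where this example currently breaks):

```python
import numpy as np
from queue import SimpleQueue

IMAGE_WIDTH, IMAGE_HEIGHT = 32, 16

# Stand-ins for the channels declared in build_module().
chan_in, to_self, chan_out = SimpleQueue(), SimpleQueue(), SimpleQueue()

image = np.arange(IMAGE_WIDTH * IMAGE_HEIGHT, dtype=np.uint32).reshape(
    IMAGE_WIDTH, IMAGE_HEIGHT
)
chan_in.put(image)                    # launch: ChannelPut("ChanIn", a)

tensor_in = chan_in.get()             # herd: ChannelGet("ChanIn", tensor_in)
tensor_out = tensor_in.copy()         # first element-by-element copy loop
to_self.put(tensor_out)               # ChannelPut("ToSelf", tensor_out)
tensor_in2 = to_self.get()            # ChannelGet("ToSelf", tensor_in2)
tensor_out2 = tensor_in2.copy()       # second copy loop
chan_out.put(tensor_out2)             # ChannelPut("ChanOut", tensor_out2)

result = chan_out.get()               # launch: ChannelGet("ChanOut", b)
assert np.array_equal(result, image)  # the example is a pure passthrough
```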
diff --git a/programming_examples/channel_examples/worker_to_worker/Makefile b/programming_examples/channel_examples/worker_to_worker/Makefile
new file mode 100644
index 000000000..79be368b8
--- /dev/null
+++ b/programming_examples/channel_examples/worker_to_worker/Makefile
@@ -0,0 +1,12 @@
+# (c) Copyright 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
+
+targetname := $(shell basename ${srcdir})
+
+run:
+	mkdir -p build
+	cd build && ${powershell} python3 ${srcdir}/run.py -v
+
+clean:
+	rm -rf build __pycache__
diff --git a/programming_examples/channel_examples/worker_to_worker/run.py b/programming_examples/channel_examples/worker_to_worker/run.py
new file mode 100644
index 000000000..b62fc4746
--- /dev/null
+++ b/programming_examples/channel_examples/worker_to_worker/run.py
@@ -0,0 +1,88 @@
+# run.py -*- Python -*-
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: MIT
+import argparse
+import numpy as np
+import air.backend.xrt as xrt_backend
+import filelock
+
+from worker_to_worker import *
+
+INOUT_DATATYPE = np.uint32
+INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
+INOUT_SIZE = IMAGE_SIZE[0] * IMAGE_SIZE[1]
+INOUT_SIZE_BYTES = INOUT_SIZE * INOUT_ELEM_SIZE
+
+
+def print_matrix(matrix_array):
+    for i in range(IMAGE_HEIGHT):
+        row = matrix_array[i * IMAGE_WIDTH : (i + 1) * IMAGE_WIDTH]
+        for val in row:
+            val = val & 0xFFFF
+            print(f"{val:04x}", end=" ")
+        print("")
+
+
+def test_main(build_module, verbose=False):
+    mlir_module = build_module()
+
+    input_a = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
+    input_b = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
+    for i in range(INOUT_SIZE):
+        input_a[i] = i + 0x1000
+        input_b[i] = 0x00DEFACED
+
+    backend = xrt_backend.XRTBackend(verbose=verbose, omit_while_true_loop=True)
+
+    if verbose:
+        print_matrix(input_b)
+
+    # run the module
+    with filelock.FileLock("/tmp/npu.lock"):
+        addone = backend.compile_and_load(mlir_module)
+        (_, output_b) = addone(input_a, input_b)
+
+    backend.unload()
+
+    if verbose:
+        print_matrix(output_b)
+
+    # check output: this is a passthrough, so output should match input_a
+    errors = 0
+    for i in range(INOUT_SIZE):
+        rb = output_b[i]
+        expected_value = input_a[i]
+
+        # value should have passed through unchanged
+        if not (rb == expected_value):
+            """
+            row = i // IMAGE_WIDTH
+            col = i % IMAGE_WIDTH
+            print(
+                f"IM {i} [{col}, {row}] should be 0x{expected_value:x}, is 0x{rb:x}\n"
+            )
+            """
+            errors += 1
+
+    if errors == 0:
+        print("PASS!")
+        exit(0)
+    else:
+        print("failed. errors=", errors)
+        exit(-1)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        prog="run.py",
+        description="Builds, runs, and tests the channel_examples/worker_to_worker example",
+    )
+
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+    )
+    args = parser.parse_args()
+    test_main(build_module, verbose=args.verbose)
diff --git a/programming_examples/channel_examples/worker_to_worker/run_makefile.lit b/programming_examples/channel_examples/worker_to_worker/run_makefile.lit
new file mode 100644
index 000000000..07d105c0c
--- /dev/null
+++ b/programming_examples/channel_examples/worker_to_worker/run_makefile.lit
@@ -0,0 +1,9 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai
+//
+// RUN: make -f %S/Makefile clean
+// RUN: make -f %S/Makefile run | FileCheck %s
+// CHECK: PASS!
+// XFAIL: *
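In `worker_to_worker.py` below, each worker uses affine maps to compute the indices of the worker it trades with: `(index + 1) mod grid_size` in each dimension. A plain-Python sketch of just that index arithmetic for the 2x2 herd used here (names are illustrative):

```python
# Grid dimensions from worker_to_worker.py:
# IMAGE_WIDTH // TILE_WIDTH == 2 and IMAGE_HEIGHT // TILE_HEIGHT == 2.
W_TILES, H_TILES = 2, 2

def next_worker(tw: int, th: int) -> tuple[int, int]:
    # Mirrors the width_next / height_next affine maps: (s0 + 1) mod N.
    return (tw + 1) % W_TILES, (th + 1) % H_TILES

for th in range(H_TILES):
    for tw in range(W_TILES):
        print(f"worker ({tw}, {th}) gets from SwitchTiles{next_worker(tw, th)}")
```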
diff --git a/programming_examples/channel_examples/worker_to_worker/worker_to_worker.py b/programming_examples/channel_examples/worker_to_worker/worker_to_worker.py
new file mode 100644
index 000000000..e1b8fa256
--- /dev/null
+++ b/programming_examples/channel_examples/worker_to_worker/worker_to_worker.py
@@ -0,0 +1,171 @@
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+
+from air.ir import *
+from air.dialects.air import *
+from air.dialects.memref import AllocOp, DeallocOp, load, store
+from air.dialects.func import FuncOp
+from air.dialects.scf import for_, yield_
+from air.dialects.affine import apply as affine_apply
+
+range_ = for_
+
+IMAGE_WIDTH = 32
+IMAGE_HEIGHT = 16
+IMAGE_SIZE = [IMAGE_WIDTH, IMAGE_HEIGHT]
+
+TILE_WIDTH = 16
+TILE_HEIGHT = 8
+TILE_SIZE = [TILE_WIDTH, TILE_HEIGHT]
+
+assert IMAGE_WIDTH % TILE_WIDTH == 0
+assert IMAGE_HEIGHT % TILE_HEIGHT == 0
+
+
+@module_builder
+def build_module():
+    memrefTyInOut = MemRefType.get(IMAGE_SIZE, T.i32())
+
+    # Create an input/output channel pair per worker
+    ChannelOp("ChanIn", size=[IMAGE_WIDTH // TILE_WIDTH, IMAGE_HEIGHT // TILE_HEIGHT])
+    ChannelOp("ChanOut", size=[IMAGE_WIDTH // TILE_WIDTH, IMAGE_HEIGHT // TILE_HEIGHT])
+    ChannelOp(
+        "SwitchTiles", size=[IMAGE_WIDTH // TILE_WIDTH, IMAGE_HEIGHT // TILE_HEIGHT]
+    )
+
+    # We will send an image worth of data in and out
+    @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
+    def copy(arg0, arg1):
+
+        # The arguments are the input and output
+        @launch(operands=[arg0, arg1])
+        def launch_body(a, b):
+
+            # Transfer one tile of data per worker
+            for h in range(IMAGE_HEIGHT // TILE_HEIGHT):
+                for w in range(IMAGE_WIDTH // TILE_WIDTH):
+                    offset0 = IMAGE_HEIGHT * h
+                    offset1 = IMAGE_HEIGHT * w
+
+                    # Put data into the channel tile by tile
+                    ChannelPut(
+                        "ChanIn",
+                        a,
+                        indices=[w, h],
+                        offsets=[offset0, offset1],
+                        sizes=[TILE_HEIGHT, TILE_WIDTH],
+                        strides=[IMAGE_WIDTH, 1],
+                    )
+
+            # Transfer one tile of data per worker
+            for h in range(IMAGE_HEIGHT // TILE_HEIGHT):
+                for w in range(IMAGE_WIDTH // TILE_WIDTH):
+                    offset0 = IMAGE_HEIGHT * h
+                    offset1 = IMAGE_HEIGHT * w
+
+                    # Write data back out to the channel tile by tile
+                    ChannelGet(
+                        "ChanOut",
+                        b,
+                        indices=[w, h],
+                        offsets=[offset0, offset1],
+                        sizes=[TILE_HEIGHT, TILE_WIDTH],
+                        strides=[IMAGE_WIDTH, 1],
+                    )
+
+            # The arguments are still the input and the output
+            @segment(name="seg")
+            def segment_body():
+
+                @herd(
+                    name="xaddherd",
+                    sizes=[IMAGE_WIDTH // TILE_WIDTH, IMAGE_HEIGHT // TILE_HEIGHT],
+                )
+                def herd_body(th, tw, _sx, _sy):
+                    height_next = AffineMap.get(
+                        0,
+                        1,
+                        [
+                            AffineExpr.get_mod(
+                                AffineExpr.get_add(
+                                    AffineSymbolExpr.get(0),
+                                    AffineConstantExpr.get(1),
+                                ),
+                                IMAGE_HEIGHT // TILE_HEIGHT,
+                            )
+                        ],
+                    )
+                    width_next = AffineMap.get(
+                        0,
+                        1,
+                        [
+                            AffineExpr.get_mod(
+                                AffineExpr.get_add(
+                                    AffineSymbolExpr.get(0),
+                                    AffineConstantExpr.get(1),
+                                ),
+                                IMAGE_WIDTH // TILE_WIDTH,
+                            )
+                        ],
+                    )
+                    tw_next = affine_apply(width_next, [tw])
+                    th_next = affine_apply(height_next, [th])
+
+                    # We want to store our data in L1 memory
+                    mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
+                    tile_type = MemRefType.get(
+                        shape=TILE_SIZE,
+                        element_type=T.i32(),
+                        memory_space=mem_space,
+                    )
+
+                    # We must allocate a buffer of tile size for the input/output
+                    tile_in = AllocOp(tile_type, [], [])
+                    tile_in2 = AllocOp(tile_type, [], [])
+                    tile_out = AllocOp(tile_type, [], [])
+                    tile_out2 = AllocOp(tile_type, [], [])
+
+                    # Copy a tile from the input image
ChannelGet("ChanIn", tile_in, indices=[tw, th]) + + # Access every value in the tile + for j in range_(TILE_HEIGHT): + for i in range_(TILE_WIDTH): + # Load the input value from tile_in + val = load(tile_in, [i, j]) + + # Store the output value in tile_out + store(val, tile_out, [i, j]) + yield_([]) + yield_([]) + + # Copy the output tile into a channel for another worker to get + ChannelPut("SwitchTiles", tile_out, indices=[tw, th]) + + # Get an output tile from another worker + ChannelGet("SwitchTiles", tile_in2, indices=[tw_next, th_next]) + + # Access every value in the tile + for j in range_(TILE_HEIGHT): + for i in range_(TILE_WIDTH): + # Load the input value from tile_in + val = load(tile_in2, [i, j]) + + # Store the output value in tile_out + store(val, tile_out2, [i, j]) + yield_([]) + yield_([]) + + # Send the output tile to the output + ChannelPut("ChanOut", tile_out, indices=[tw, th]) + + # Deallocate our L1 buffers + DeallocOp(tile_in) + DeallocOp(tile_out) + DeallocOp(tile_in2) + DeallocOp(tile_out2) + + +if __name__ == "__main__": + module = build_module() + print(module)