From fbdcad320857d3a534ea6d955279f0d0f4c3a99c Mon Sep 17 00:00:00 2001 From: Erika Hunhoff Date: Tue, 16 Jul 2024 19:05:56 -0600 Subject: [PATCH] Access L2 Segment Allocation in Herd with Python Example (#668) --- .../worker_to_self/worker_to_self.py | 62 +++++----- programming_examples/segment_alloc/Makefile | 12 ++ programming_examples/segment_alloc/run.py | 88 ++++++++++++++ .../segment_alloc/run_makefile.lit | 8 ++ .../segment_alloc/segment_alloc.py | 109 ++++++++++++++++++ python/air/dialects/_air_ops_ext.py | 29 ++++- 6 files changed, 272 insertions(+), 36 deletions(-) create mode 100644 programming_examples/segment_alloc/Makefile create mode 100644 programming_examples/segment_alloc/run.py create mode 100644 programming_examples/segment_alloc/run_makefile.lit create mode 100644 programming_examples/segment_alloc/segment_alloc.py diff --git a/programming_examples/channel_examples/worker_to_self/worker_to_self.py b/programming_examples/channel_examples/worker_to_self/worker_to_self.py index 8060fe363..ee9f68293 100644 --- a/programming_examples/channel_examples/worker_to_self/worker_to_self.py +++ b/programming_examples/channel_examples/worker_to_self/worker_to_self.py @@ -23,14 +23,18 @@ def build_module(): ChannelOp("ChanOut") ChannelOp("ToSelf") - # We want to store our data in L1 memory - mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1) + mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1) + image_type_l1 = MemRefType.get( + shape=IMAGE_SIZE, + element_type=T.i32(), + memory_space=mem_space_l1, + ) - # This is the type definition of the image - image_type = MemRefType.get( + mem_space_l2 = IntegerAttr.get(T.i32(), MemorySpace.L2) + image_type_l2 = MemRefType.get( shape=IMAGE_SIZE, element_type=T.i32(), - memory_space=mem_space, + memory_space=mem_space_l2, ) # We will send an image worth of data in and out @@ -47,51 +51,43 @@ def launch_body(a, b): @segment(name="seg") def segment_body(): + tensor_in_l2 = AllocOp(image_type_l2, [], []) + ChannelGet("ChanIn", tensor_in_l2) + # The herd sizes correspond to the dimensions of the contiguous block of cores we are hoping to get. # We just need one compute core, so we ask for a 1x1 herd - @herd(name="copyherd", sizes=[1, 1]) - def herd_body(tx, ty, sx, sy): + @herd( + name="copyherd", + sizes=[1, 1], + operands=[tensor_in_l2], + ) + def herd_body(tx, ty, sx, sy, tensor_in_l2): # We must allocate a buffer of image size for the input/output - tensor_in = AllocOp(image_type, [], []) - tensor_out = AllocOp(image_type, [], []) - tensor_in2 = AllocOp(image_type, [], []) - tensor_out2 = AllocOp(image_type, [], []) + tensor_in_l1 = AllocOp(image_type_l1, [], []) + tensor_out_l1 = AllocOp(image_type_l1, [], []) - ChannelGet("ChanIn", tensor_in) + ChannelPut("ToSelf", tensor_in_l2) + ChannelGet("ToSelf", tensor_in_l1) # Access every value in the tile for j in range_(IMAGE_HEIGHT): for i in range_(IMAGE_WIDTH): # Load the input value from tile_in - val = load(tensor_in, [i, j]) + val = load(tensor_in_l1, [i, j]) # Store the output value in tile_out - store(val, tensor_out, [i, j]) + store(val, tensor_out_l1, [i, j]) yield_([]) yield_([]) - ChannelPut("ToSelf", tensor_out) - ChannelGet("ToSelf", tensor_in2) - - # Access every value in the tile - for j in range_(IMAGE_HEIGHT): - for i in range_(IMAGE_WIDTH): - # Load the input value from tile_in - val = load(tensor_in2, [i, j]) - - # Store the output value in tile_out - store(val, tensor_out2, [i, j]) - yield_([]) - yield_([]) - - ChannelPut("ChanOut", tensor_out2) + ChannelPut("ChanOut", tensor_out_l1) # Deallocate our L1 buffers - DeallocOp(tensor_in) - DeallocOp(tensor_out) - DeallocOp(tensor_in2) - DeallocOp(tensor_out2) + DeallocOp(tensor_in_l1) + DeallocOp(tensor_out_l1) + + DeallocOp(tensor_in_l2) if __name__ == "__main__": diff --git a/programming_examples/segment_alloc/Makefile b/programming_examples/segment_alloc/Makefile new file mode 100644 index 000000000..e25a18738 --- /dev/null +++ b/programming_examples/segment_alloc/Makefile @@ -0,0 +1,12 @@ +# (c) Copyright 2024 Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT +srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) + +targetname := $(shell basename ${srcdir}) + +run: + mkdir -p build + cd build && ${powershell} python3 ${srcdir}/run.py + +clean: + rm -rf build __pycache__ \ No newline at end of file diff --git a/programming_examples/segment_alloc/run.py b/programming_examples/segment_alloc/run.py new file mode 100644 index 000000000..332ae67b6 --- /dev/null +++ b/programming_examples/segment_alloc/run.py @@ -0,0 +1,88 @@ +# run.py -*- Python -*- +# +# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. +# SPDX-License-Identifier: MIT +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT + +import argparse +import numpy as np +import air.backend.xrt as xrt_backend +import filelock + +from segment_alloc import * + +INOUT_DATATYPE = np.uint32 +INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize +INOUT_SIZE = IMAGE_SIZE[0] * IMAGE_SIZE[1] +INOUT_SIZE_BYTES = INOUT_SIZE * INOUT_ELEM_SIZE + + +def main(verbose=False, experimental_passes=False): + mlir_module = build_module() + + input_a = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE) + output_b = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE) + for i in range(INOUT_SIZE): + input_a[i] = i + 0x1000 + output_b[i] = 0x00DEFACED + + backend = xrt_backend.XRTBackend( + verbose=verbose, + experimental_passes=experimental_passes, + omit_while_true_loop=True, + ) + + # run the module + with filelock.FileLock("/tmp/npu.lock"): + mul = backend.compile_and_load(mlir_module) + (_, output_b) = mul(input_a, output_b) + + backend.unload() + + # check output, should have the top left filled in + errors = 0 + for i in range(INOUT_SIZE): + rb = output_b[i] + + row = i / IMAGE_WIDTH + col = i % IMAGE_WIDTH + + if row < TILE_HEIGHT and col < TILE_WIDTH: + # value should have been updated + if not (rb == 0x1000 + i): + print(f"IM {i} [{col}, {row}] should be 0x{i:x}, is 0x{rb:x}\n") + errors += 1 + else: + # value should stay unchanged + if rb != 0x00DEFACED: + print( + f"IM {i} [{col}, {row}] should be 0xdefaced, is 0x{rb:x}\n", + i, + col, + row, + rb, + ) + errors += 1 + + if errors == 0: + print("PASS!") + exit(0) + else: + print("failed. errors=", errors) + exit(-1) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="run.py", + description="Builds, runs, and tests the segment_alloc example", + ) + + parser.add_argument( + "-v", + "--verbose", + action="store_true", + ) + args = parser.parse_args() + main(experimental_passes=True, verbose=args.verbose) diff --git a/programming_examples/segment_alloc/run_makefile.lit b/programming_examples/segment_alloc/run_makefile.lit new file mode 100644 index 000000000..fe881ef0f --- /dev/null +++ b/programming_examples/segment_alloc/run_makefile.lit @@ -0,0 +1,8 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. + // SPDX-License-Identifier: MIT + // + // REQUIRES: ryzen_ai + // + // RUN: make -f %S/Makefile clean + // RUN: make -f %S/Makefile run | FileCheck %s + // CHECK: PASS! diff --git a/programming_examples/segment_alloc/segment_alloc.py b/programming_examples/segment_alloc/segment_alloc.py new file mode 100644 index 000000000..ad2c28a0f --- /dev/null +++ b/programming_examples/segment_alloc/segment_alloc.py @@ -0,0 +1,109 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT + +from air.ir import * +from air.dialects.air import * +from air.dialects.memref import AllocOp, DeallocOp, load, store +from air.dialects.func import FuncOp +from air.dialects.scf import for_, yield_ + +range_ = for_ + +IMAGE_WIDTH = 32 +IMAGE_HEIGHT = 16 +IMAGE_SIZE = [IMAGE_WIDTH, IMAGE_HEIGHT] + +TILE_WIDTH = 16 +TILE_HEIGHT = 8 +TILE_SIZE = [TILE_WIDTH, TILE_HEIGHT] + + +@module_builder +def build_module(): + memrefTyInOut = MemRefType.get(IMAGE_SIZE, T.i32()) + + # We will send an image worth of data in and out + @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut) + def copy(arg0, arg1): + + # The arguments are the input and output + @launch(operands=[arg0, arg1]) + def launch_body(a, b): + + # The arguments are still the input and the output + @segment(name="seg", operands=[a, b]) + def segment_body(arg2, arg3): + # We want to store our data in L1 memory + mem_space_l2 = IntegerAttr.get(T.i32(), MemorySpace.L2) + + # This is the type definition of the tile + tile_type_l2 = MemRefType.get( + shape=TILE_SIZE, + element_type=T.i32(), + memory_space=mem_space_l2, + ) + + # We must allocate a buffer of tile size for the input/output + tile_in_l2 = AllocOp(tile_type_l2, [], []) + + # The herd sizes correspond to the dimensions of the contiguous block of cores we are hoping to get. + # We just need one compute core, so we ask for a 1x1 herd + @herd(name="copyherd", sizes=[1, 1], operands=[arg2, arg3, tile_in_l2]) + def herd_body(tx, ty, sx, sy, a, b, my_l2_tile): + + # We want to store our data in L1 memory + mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1) + + # This is the type definition of the tile + tile_type_l1 = MemRefType.get( + shape=TILE_SIZE, + element_type=T.i32(), + memory_space=mem_space_l1, + ) + + # We must allocate a buffer of tile size for the input/output + tile_in_l1 = AllocOp(tile_type_l1, [], []) + tile_out_l1 = AllocOp(tile_type_l1, [], []) + + dma_memcpy_nd( + my_l2_tile, + a, + src_offsets=[0, 0], + src_sizes=[TILE_HEIGHT, TILE_WIDTH], + src_strides=[IMAGE_WIDTH, 1], + ) + + # Copy a tile from the input image (a) into the L1 memory region (tile_in) + dma_memcpy_nd( + tile_in_l1, + my_l2_tile, + ) + + # Access every value in the tile + for j in range_(TILE_HEIGHT): + for i in range_(TILE_WIDTH): + # Load the input value from tile_in + val = load(tile_in_l1, [i, j]) + + # Store the output value in tile_out + store(val, tile_out_l1, [i, j]) + yield_([]) + yield_([]) + + # Copy the output tile into the output + dma_memcpy_nd( + b, + tile_out_l1, + dst_offsets=[0, 0], + dst_sizes=[TILE_HEIGHT, TILE_WIDTH], + dst_strides=[IMAGE_WIDTH, 1], + ) + + # Deallocate our L1 buffers + DeallocOp(tile_in_l1) + DeallocOp(tile_out_l1) + + +if __name__ == "__main__": + module = build_module() + print(module) diff --git a/python/air/dialects/_air_ops_ext.py b/python/air/dialects/_air_ops_ext.py index 8a5dd032d..749583ea4 100644 --- a/python/air/dialects/_air_ops_ext.py +++ b/python/air/dialects/_air_ops_ext.py @@ -23,6 +23,29 @@ def pyint_to_index(i): return arith.ConstantOp.create_index(i) if isinstance(i, int) else i +def get_region_operand_types(operands): + """ + Utility function to get the type of arguments given to region ops. + """ + operand_types = [] + for o in operands: + if isinstance(o, Value): + operand_types.append(o.type) + elif isinstance(o, OpView): + if len(o.results.types) != 1: + raise AttributeError( + f"Operation given to a region op as a parameter ({o}) has more " + "than one return type ({o.results.types}), which would lead to a mismatch " + "between number of operands and number of operand types" + ) + operand_types += o.results.types + else: + raise AttributeError( + f"Argument {o} is not a Value or an Operation: {type(o).mro()}" + ) + return operand_types + + class Launch(LaunchOp): """Specialization for LaunchOp class.""" @@ -48,7 +71,7 @@ def __init__( launch_operands=operands, sym_name=name, ) - operand_types = [s.type for s in sizes] * 2 + [o.type for o in operands] + operand_types = [s.type for s in sizes] * 2 + get_region_operand_types(operands) self.regions[0].blocks.append(*operand_types) @@ -74,7 +97,7 @@ def __init__( segment_operands=operands, sym_name=name, ) - operand_types = [s.type for s in sizes] * 2 + [o.type for o in operands] + operand_types = [s.type for s in sizes] * 2 + get_region_operand_types(operands) self.regions[0].blocks.append(*operand_types) @@ -102,7 +125,7 @@ def __init__( sym_name=name, link_with=link_with, ) - operand_types = [s.type for s in sizes] * 2 + [o.type for o in operands] + operand_types = [s.type for s in sizes] * 2 + get_region_operand_types(operands) self.regions[0].blocks.append(*operand_types)