Skip to content

Commit

Permalink
Access L2 Segment Allocation in Herd with Python Example (#668)
Browse files Browse the repository at this point in the history
  • Loading branch information
hunhoffe authored Jul 17, 2024
1 parent 4c0b95e commit fbdcad3
Show file tree
Hide file tree
Showing 6 changed files with 272 additions and 36 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,18 @@ def build_module():
ChannelOp("ChanOut")
ChannelOp("ToSelf")

# We want to store our data in L1 memory
mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1)
image_type_l1 = MemRefType.get(
shape=IMAGE_SIZE,
element_type=T.i32(),
memory_space=mem_space_l1,
)

# This is the type definition of the image
image_type = MemRefType.get(
mem_space_l2 = IntegerAttr.get(T.i32(), MemorySpace.L2)
image_type_l2 = MemRefType.get(
shape=IMAGE_SIZE,
element_type=T.i32(),
memory_space=mem_space,
memory_space=mem_space_l2,
)

# We will send an image worth of data in and out
Expand All @@ -47,51 +51,43 @@ def launch_body(a, b):
@segment(name="seg")
def segment_body():

tensor_in_l2 = AllocOp(image_type_l2, [], [])
ChannelGet("ChanIn", tensor_in_l2)

# The herd sizes correspond to the dimensions of the contiguous block of cores we are hoping to get.
# We just need one compute core, so we ask for a 1x1 herd
@herd(name="copyherd", sizes=[1, 1])
def herd_body(tx, ty, sx, sy):
@herd(
name="copyherd",
sizes=[1, 1],
operands=[tensor_in_l2],
)
def herd_body(tx, ty, sx, sy, tensor_in_l2):

# We must allocate a buffer of image size for the input/output
tensor_in = AllocOp(image_type, [], [])
tensor_out = AllocOp(image_type, [], [])
tensor_in2 = AllocOp(image_type, [], [])
tensor_out2 = AllocOp(image_type, [], [])
tensor_in_l1 = AllocOp(image_type_l1, [], [])
tensor_out_l1 = AllocOp(image_type_l1, [], [])

ChannelGet("ChanIn", tensor_in)
ChannelPut("ToSelf", tensor_in_l2)
ChannelGet("ToSelf", tensor_in_l1)

# Access every value in the tile
for j in range_(IMAGE_HEIGHT):
for i in range_(IMAGE_WIDTH):
# Load the input value from tile_in
val = load(tensor_in, [i, j])
val = load(tensor_in_l1, [i, j])

# Store the output value in tile_out
store(val, tensor_out, [i, j])
store(val, tensor_out_l1, [i, j])
yield_([])
yield_([])

ChannelPut("ToSelf", tensor_out)
ChannelGet("ToSelf", tensor_in2)

# Access every value in the tile
for j in range_(IMAGE_HEIGHT):
for i in range_(IMAGE_WIDTH):
# Load the input value from tile_in
val = load(tensor_in2, [i, j])

# Store the output value in tile_out
store(val, tensor_out2, [i, j])
yield_([])
yield_([])

ChannelPut("ChanOut", tensor_out2)
ChannelPut("ChanOut", tensor_out_l1)

# Deallocate our L1 buffers
DeallocOp(tensor_in)
DeallocOp(tensor_out)
DeallocOp(tensor_in2)
DeallocOp(tensor_out2)
DeallocOp(tensor_in_l1)
DeallocOp(tensor_out_l1)

DeallocOp(tensor_in_l2)


if __name__ == "__main__":
Expand Down
12 changes: 12 additions & 0 deletions programming_examples/segment_alloc/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# (c) Copyright 2024 Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
# Directory containing this Makefile, so the targets work when invoked from
# another directory (e.g. `make -f <path>/Makefile run`).
srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))

# Name of the example, taken from the directory name (not used by the rules below).
targetname := $(shell basename ${srcdir})

# Neither target produces a file with its own name; declaring them phony keeps
# them working even if a file called `run` or `clean` exists.
.PHONY: run clean

# Run the example from a scratch build directory.
# NOTE(review): ${powershell} is presumably an optional command prefix supplied
# by the environment; it expands to nothing when unset -- confirm.
run:
	mkdir -p build
	cd build && ${powershell} python3 ${srcdir}/run.py

# Remove everything `run` may have generated.
clean:
	rm -rf build __pycache__
88 changes: 88 additions & 0 deletions programming_examples/segment_alloc/run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# run.py -*- Python -*-
#
# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
# SPDX-License-Identifier: MIT
# Copyright (C) 2024, Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT

import argparse
import numpy as np
import air.backend.xrt as xrt_backend
import filelock

from segment_alloc import *

# Element type of the host-side input and output buffers.
INOUT_DATATYPE = np.uint32
# Size in bytes of a single element.
INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
# Number of elements in one image (product of the two IMAGE_SIZE dimensions).
INOUT_SIZE = IMAGE_SIZE[0] * IMAGE_SIZE[1]
# Size in bytes of one image.
INOUT_SIZE_BYTES = INOUT_SIZE * INOUT_ELEM_SIZE


def main(verbose=False, experimental_passes=False):
    """Build the module, run it through the XRT backend and check the output.

    The input buffer is filled with ``0x1000 + index`` and the output buffer
    with a sentinel value. After the run, every element whose (row, col)
    position lies inside the top-left TILE_HEIGHT x TILE_WIDTH tile must hold
    the corresponding input value; every other element must still hold the
    sentinel.

    Args:
        verbose: forwarded to ``XRTBackend``.
        experimental_passes: forwarded to ``XRTBackend``.

    The function does not return: it exits the process with status 0 after
    printing ``PASS!`` when there are no mismatches, and with status -1
    otherwise.
    """
    input_base = 0x1000  # value stored at input index 0
    sentinel = 0x00DEFACED  # marks output elements that must stay untouched

    mlir_module = build_module()

    # Build both buffers directly instead of overwriting them element by element.
    input_a = np.arange(input_base, input_base + INOUT_SIZE, dtype=INOUT_DATATYPE)
    output_b = np.full(INOUT_SIZE, sentinel, dtype=INOUT_DATATYPE)

    backend = xrt_backend.XRTBackend(
        verbose=verbose,
        experimental_passes=experimental_passes,
        omit_while_true_loop=True,
    )

    # run the module
    # NOTE(review): the file lock presumably serializes device access between
    # concurrently running examples -- confirm.
    with filelock.FileLock("/tmp/npu.lock"):
        mul = backend.compile_and_load(mlir_module)
        (_, output_b) = mul(input_a, output_b)

    backend.unload()

    # check output, should have the top left filled in
    errors = 0
    for i in range(INOUT_SIZE):
        rb = int(output_b[i])

        # Integer row/column of element i in the flattened image.
        row, col = divmod(i, IMAGE_WIDTH)

        if row < TILE_HEIGHT and col < TILE_WIDTH:
            # value should have been updated
            expected = input_base + i
            if rb != expected:
                print(
                    f"IM {i} [{col}, {row}] should be 0x{expected:x}, is 0x{rb:x}\n"
                )
                errors += 1
        else:
            # value should stay unchanged
            if rb != sentinel:
                print(f"IM {i} [{col}, {row}] should be 0xdefaced, is 0x{rb:x}\n")
                errors += 1

    if errors == 0:
        print("PASS!")
        exit(0)
    else:
        print("failed. errors=", errors)
        exit(-1)


if __name__ == "__main__":
    # Command-line entry point: the only option is a verbosity switch.
    cli = argparse.ArgumentParser(
        prog="run.py",
        description="Builds, runs, and tests the segment_alloc example",
    )
    cli.add_argument("-v", "--verbose", action="store_true")
    options = cli.parse_args()

    # Experimental passes are always enabled for this example.
    main(verbose=options.verbose, experimental_passes=True)
8 changes: 8 additions & 0 deletions programming_examples/segment_alloc/run_makefile.lit
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
// (c) Copyright 2024 Advanced Micro Devices, Inc.
// SPDX-License-Identifier: MIT
//
// REQUIRES: ryzen_ai
//
// RUN: make -f %S/Makefile clean
// RUN: make -f %S/Makefile run | FileCheck %s
// CHECK: PASS!
109 changes: 109 additions & 0 deletions programming_examples/segment_alloc/segment_alloc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# Copyright (C) 2024, Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT

from air.ir import *
from air.dialects.air import *
from air.dialects.memref import AllocOp, DeallocOp, load, store
from air.dialects.func import FuncOp
from air.dialects.scf import for_, yield_

# Alias for scf.for_, used for the loops inside the herd body below.
range_ = for_

# Dimensions of the full image passed in and out of the `copy` function.
IMAGE_WIDTH = 32
IMAGE_HEIGHT = 16
IMAGE_SIZE = [IMAGE_WIDTH, IMAGE_HEIGHT]

# Dimensions of the tile that is copied from the input to the output.
TILE_WIDTH = 16
TILE_HEIGHT = 8
TILE_SIZE = [TILE_WIDTH, TILE_HEIGHT]


@module_builder
def build_module():
    """Describe a `copy` function that moves one tile of an image via L2 and L1.

    `copy` takes two IMAGE_SIZE i32 memrefs (input and output). Inside a
    launch/segment/herd nest, a TILE_SIZE buffer is allocated in L2 at segment
    level and handed to a 1x1 herd as an operand. The herd copies a tile of the
    input into that L2 buffer, then into an L1 buffer, copies it element by
    element into a second L1 buffer, and finally writes that buffer to the output.
    """
    memrefTyInOut = MemRefType.get(IMAGE_SIZE, T.i32())

    # We will send an image worth of data in and out
    @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
    def copy(arg0, arg1):

        # The arguments are the input and output
        @launch(operands=[arg0, arg1])
        def launch_body(a, b):

            # The arguments are still the input and the output
            @segment(name="seg", operands=[a, b])
            def segment_body(arg2, arg3):
                # We want this buffer to live in L2 memory
                mem_space_l2 = IntegerAttr.get(T.i32(), MemorySpace.L2)

                # This is the type definition of the L2 tile
                tile_type_l2 = MemRefType.get(
                    shape=TILE_SIZE,
                    element_type=T.i32(),
                    memory_space=mem_space_l2,
                )

                # Allocate a tile-sized L2 buffer at segment level; it is passed
                # to the herd below as an operand.
                # NOTE(review): this buffer is never passed to DeallocOp in this
                # function -- confirm whether that is intentional.
                tile_in_l2 = AllocOp(tile_type_l2, [], [])

                # The herd sizes correspond to the dimensions of the contiguous block of cores we are hoping to get.
                # We just need one compute core, so we ask for a 1x1 herd
                @herd(name="copyherd", sizes=[1, 1], operands=[arg2, arg3, tile_in_l2])
                def herd_body(tx, ty, sx, sy, a, b, my_l2_tile):

                    # We want to store our data in L1 memory
                    mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1)

                    # This is the type definition of the L1 tile
                    tile_type_l1 = MemRefType.get(
                        shape=TILE_SIZE,
                        element_type=T.i32(),
                        memory_space=mem_space_l1,
                    )

                    # We must allocate a buffer of tile size for the input/output
                    tile_in_l1 = AllocOp(tile_type_l1, [], [])
                    tile_out_l1 = AllocOp(tile_type_l1, [], [])

                    # Copy a tile from the input image (a) into the L2 buffer
                    dma_memcpy_nd(
                        my_l2_tile,
                        a,
                        src_offsets=[0, 0],
                        src_sizes=[TILE_HEIGHT, TILE_WIDTH],
                        src_strides=[IMAGE_WIDTH, 1],
                    )

                    # Copy the tile from the L2 buffer into the L1 buffer (tile_in_l1)
                    dma_memcpy_nd(
                        tile_in_l1,
                        my_l2_tile,
                    )

                    # Access every value in the tile
                    for j in range_(TILE_HEIGHT):
                        for i in range_(TILE_WIDTH):
                            # Load the input value from tile_in_l1
                            val = load(tile_in_l1, [i, j])

                            # Store the output value in tile_out_l1
                            store(val, tile_out_l1, [i, j])
                            yield_([])
                        yield_([])

                    # Copy the output tile into the output
                    dma_memcpy_nd(
                        b,
                        tile_out_l1,
                        dst_offsets=[0, 0],
                        dst_sizes=[TILE_HEIGHT, TILE_WIDTH],
                        dst_strides=[IMAGE_WIDTH, 1],
                    )

                    # Deallocate our L1 buffers
                    DeallocOp(tile_in_l1)
                    DeallocOp(tile_out_l1)


# When run as a script, build the module and print it.
if __name__ == "__main__":
    module = build_module()
    print(module)
29 changes: 26 additions & 3 deletions python/air/dialects/_air_ops_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,29 @@ def pyint_to_index(i):
return arith.ConstantOp.create_index(i) if isinstance(i, int) else i


def get_region_operand_types(operands):
    """
    Utility function to get the type of arguments given to region ops.

    Each operand may be either a ``Value`` (its ``type`` is used) or an
    ``OpView`` with exactly one result (the type of that result is used), so
    the returned list always has one entry per operand.

    Args:
        operands: iterable of ``Value`` and/or ``OpView`` objects.

    Returns:
        A list with the type of each operand, in order.

    Raises:
        AttributeError: if an operand is an ``OpView`` that does not have
            exactly one result, or is neither a ``Value`` nor an ``OpView``.
    """
    operand_types = []
    for o in operands:
        if isinstance(o, Value):
            operand_types.append(o.type)
        elif isinstance(o, OpView):
            result_types = o.results.types
            # Zero or several results cannot be mapped onto a single block argument.
            if len(result_types) != 1:
                raise AttributeError(
                    f"Operation given to a region op as a parameter ({o}) does not have "
                    f"exactly one return type ({result_types}), which would lead to a mismatch "
                    "between number of operands and number of operand types"
                )
            operand_types += result_types
        else:
            raise AttributeError(
                f"Argument {o} is not a Value or an Operation: {type(o).mro()}"
            )
    return operand_types


class Launch(LaunchOp):
"""Specialization for LaunchOp class."""

Expand All @@ -48,7 +71,7 @@ def __init__(
launch_operands=operands,
sym_name=name,
)
operand_types = [s.type for s in sizes] * 2 + [o.type for o in operands]
operand_types = [s.type for s in sizes] * 2 + get_region_operand_types(operands)
self.regions[0].blocks.append(*operand_types)


Expand All @@ -74,7 +97,7 @@ def __init__(
segment_operands=operands,
sym_name=name,
)
operand_types = [s.type for s in sizes] * 2 + [o.type for o in operands]
operand_types = [s.type for s in sizes] * 2 + get_region_operand_types(operands)
self.regions[0].blocks.append(*operand_types)


Expand Down Expand Up @@ -102,7 +125,7 @@ def __init__(
sym_name=name,
link_with=link_with,
)
operand_types = [s.type for s in sizes] * 2 + [o.type for o in operands]
operand_types = [s.type for s in sizes] * 2 + get_region_operand_types(operands)
self.regions[0].blocks.append(*operand_types)


Expand Down

0 comments on commit fbdcad3

Please sign in to comment.