Skip to content

Commit

Permalink
Programming Example using ChannelOp size Parameter (#642)
Browse files Browse the repository at this point in the history
* Add channel size example

* Update documentation

* modify expected output for verification

* Fix a few small issues

* Make channel size standalone

* matrix scalar add example is working

* channel size example is working

* Realized that there is not really common code in the channel examples so made herd_to_herd standalone too. Also updated docs

* Clarify example documentation
  • Loading branch information
hunhoffe authored Jul 8, 2024
1 parent ae11b76 commit 6f6f639
Show file tree
Hide file tree
Showing 7 changed files with 250 additions and 40 deletions.
9 changes: 9 additions & 0 deletions programming_examples/channel_examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,13 @@ cd herd_to_herd
make clean && make
```

#### ```channel-size```: Use the channel size argument

This example ([channel_size/channel_size.py](channel_size/channel_size.py)) is a data passthrough example that uses the same tiling structure as the [matrix_scalar_add/multi_core_channel](../matrix_scalar_add/multi_core_channel.py) example. Instead of defining a separate channel for each tile/core, it creates a single bundle of channels (using the `ChannelOp` `size` parameter) and selects a channel within the bundle by index (using the `indices` parameter of `ChannelGet` and `ChannelPut`).

```bash
cd channel_size
make clean && make
```

#### WIP: more examples!
12 changes: 12 additions & 0 deletions programming_examples/channel_examples/channel_size/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Copyright (C) 2022, Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT

# Directory containing this Makefile, resolved to a real path so that the
# example can be driven from any working directory (e.g. `make -f <path>`).
srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))

# Name of the directory holding this example.
targetname := $(shell basename ${srcdir})

# Neither target produces a file with its own name, so always run the recipes.
.PHONY: run clean

# Default target: run the example from a scratch `build` directory.
# `${powershell}` is not defined in this file; it expands to nothing unless
# the caller or the environment provides it.
run:
	mkdir -p build
	cd build && ${powershell} python3 ${srcdir}/run.py

# Remove the scratch directory and Python bytecode caches.
clean:
	rm -rf build __pycache__
124 changes: 124 additions & 0 deletions programming_examples/channel_examples/channel_size/channel_size.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
# Copyright (C) 2024, Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT

from air.ir import *
from air.dialects.air import *
from air.dialects.memref import AllocOp, DeallocOp, load, store
from air.dialects.func import FuncOp
from air.dialects.scf import for_, yield_

# Alias the `for_` helper imported from air.dialects.scf so that the
# IR-building loops below read like ordinary Python `range` loops.
range_ = for_

# Dimensions of the whole image that is passed through the design.
IMAGE_WIDTH, IMAGE_HEIGHT = 32, 16
IMAGE_SIZE = [IMAGE_WIDTH, IMAGE_HEIGHT]

# Dimensions of the tile handled by a single worker.
TILE_WIDTH, TILE_HEIGHT = 16, 8
TILE_SIZE = [TILE_WIDTH, TILE_HEIGHT]

# The image must divide into a whole number of tiles in both directions.
assert not IMAGE_WIDTH % TILE_WIDTH
assert not IMAGE_HEIGHT % TILE_HEIGHT


def format_name(prefix, index_0, index_1):
    """Return ``prefix`` followed by both indices, each zero-padded to two digits."""
    return "{}{:02}{:02}".format(prefix, index_0, index_1)


@module_builder
def build_module():
    """Build the AIR module for the channel-size passthrough example.

    Two channel bundles, ``ChanIn`` and ``ChanOut``, are declared with one
    channel per tile through the ``ChannelOp`` ``size`` argument. The launch
    body puts every tile of the input into ``ChanIn`` and gets every tile of
    the output from ``ChanOut``. Each herd worker receives one tile, copies it
    element by element into a second L1 buffer and sends that buffer back.
    """
    memrefTyInOut = MemRefType.get(IMAGE_SIZE, T.i32())

    # Number of tiles along each image dimension; this is also the shape of
    # the channel bundles and of the herd.
    tiles_per_width = IMAGE_WIDTH // TILE_WIDTH
    tiles_per_height = IMAGE_HEIGHT // TILE_HEIGHT

    # Create an input/output channel pair per worker
    ChannelOp("ChanIn", size=[tiles_per_width, tiles_per_height])
    ChannelOp("ChanOut", size=[tiles_per_width, tiles_per_height])

    # We will send an image worth of data in and out
    @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
    def copy(arg0, arg1):

        # The arguments are the input and output
        @launch(operands=[arg0, arg1])
        def launch_body(a, b):

            # Transfer one tile of data per worker
            for h in range(tiles_per_height):
                for w in range(tiles_per_width):
                    # A tile starts TILE_HEIGHT rows / TILE_WIDTH columns
                    # after its neighbour. Scaling by IMAGE_HEIGHT instead
                    # would advance a whole image height per tile row.
                    offset0 = TILE_HEIGHT * h
                    offset1 = TILE_WIDTH * w

                    # Put data into the channel tile by tile
                    ChannelPut(
                        "ChanIn",
                        a,
                        indices=[w, h],
                        offsets=[offset0, offset1],
                        sizes=[TILE_HEIGHT, TILE_WIDTH],
                        strides=[IMAGE_WIDTH, 1],
                    )

            # Collect one tile of data per worker
            for h in range(tiles_per_height):
                for w in range(tiles_per_width):
                    # Same tile placement as on the input side.
                    offset0 = TILE_HEIGHT * h
                    offset1 = TILE_WIDTH * w

                    # Read data back out of the channel tile by tile
                    ChannelGet(
                        "ChanOut",
                        b,
                        indices=[w, h],
                        offsets=[offset0, offset1],
                        sizes=[TILE_HEIGHT, TILE_WIDTH],
                        strides=[IMAGE_WIDTH, 1],
                    )

            # The segment takes no operands: all data reaches the herd
            # through the channels declared above.
            @segment(name="seg")
            def segment_body():

                @herd(
                    name="xaddherd",
                    sizes=[tiles_per_width, tiles_per_height],
                )
                def herd_body(th, tw, _sx, _sy):

                    # We want to store our data in L1 memory
                    mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)

                    # This is the type definition of the tile
                    tile_type = MemRefType.get(
                        shape=TILE_SIZE,
                        element_type=T.i32(),
                        memory_space=mem_space,
                    )

                    # We must allocate a buffer of tile size for the input/output
                    tile_in = AllocOp(tile_type, [], [])
                    tile_out = AllocOp(tile_type, [], [])

                    # Receive this worker's tile from its input channel into
                    # the L1 buffer (tile_in)
                    ChannelGet("ChanIn", tile_in, indices=[tw, th])

                    # Access every value in the tile
                    for j in range_(TILE_HEIGHT):
                        for i in range_(TILE_WIDTH):
                            # Load the input value from tile_in
                            val = load(tile_in, [i, j])

                            # Store the output value in tile_out
                            store(val, tile_out, [i, j])
                            yield_([])
                        yield_([])

                    # Send the output tile on this worker's output channel
                    ChannelPut("ChanOut", tile_out, indices=[tw, th])

                    # Deallocate our L1 buffers
                    DeallocOp(tile_in)
                    DeallocOp(tile_out)


if __name__ == "__main__":
    # When executed as a script, print the generated module to stdout.
    print(build_module())
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
# Copyright (C) 2024, Advanced Micro Devices, Inc.
# run.py -*- Python -*-
#
# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
# SPDX-License-Identifier: MIT

import argparse
import numpy as np
import air.backend.xrt as xrt_backend
import filelock

IMAGE_WIDTH = 32
IMAGE_HEIGHT = 16
IMAGE_SIZE = [IMAGE_WIDTH, IMAGE_HEIGHT]
from channel_size import *

INOUT_DATATYPE = np.uint32
INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
Expand All @@ -30,10 +30,9 @@ def test_main(build_module, verbose=False):
input_a = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
input_b = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
for i in range(INOUT_SIZE):
input_a[i] = 0x2
input_b[i] = 0x00C0FFEE
input_a[i] = i + 0x1000
input_b[i] = 0x00DEFACED

# TODO(hunhoffe): need to figure out why single-core-dma fails with experimental_passes=True
backend = xrt_backend.XRTBackend(verbose=verbose, omit_while_true_loop=True)

if verbose:
Expand All @@ -53,12 +52,12 @@ def test_main(build_module, verbose=False):
errors = 0
for i in range(INOUT_SIZE):
rb = output_b[i]
expected_value = input_a[i]

row = i // IMAGE_WIDTH
col = i % IMAGE_WIDTH

# value should have been updated
expected_value = 0x2 * 0x2 + 1
if not (rb == expected_value):
"""
print(
Expand All @@ -73,3 +72,18 @@ def test_main(build_module, verbose=False):
else:
print("failed. errors=", errors)
exit(-1)


if __name__ == "__main__":
    # Command-line entry point: parse the flags, then build, run and check
    # the example through test_main().
    parser = argparse.ArgumentParser(
        prog="run.py",
        # This script drives the channel_size example (it imports from
        # channel_size), so the description names that example.
        description="Builds, runs, and tests the channel_examples/channel_size example",
    )

    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="enable verbose output",
    )
    args = parser.parse_args()
    test_main(build_module, verbose=args.verbose)
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
// (c) Copyright 2024 Advanced Micro Devices, Inc.
// SPDX-License-Identifier: MIT
//
// REQUIRES: ryzen_ai
//
// RUN: make -f %S/Makefile clean
// RUN: make -f %S/Makefile run | FileCheck %s
// CHECK: PASS!
18 changes: 3 additions & 15 deletions programming_examples/channel_examples/herd_to_herd/herd_to_herd.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,5 @@
# Copyright (C) 2024, Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
import sys
from pathlib import Path # if you haven't already done so

# Python paths are a bit complex. Taking solution from : https://stackoverflow.com/questions/16981921/relative-imports-in-python-3
file = Path(__file__).resolve()
parent, root = file.parent, file.parents[1]
sys.path.append(str(root))

# Additionally remove the current file's directory from sys.path
try:
sys.path.remove(str(parent))
except ValueError: # Already removed
pass

from air.ir import *
from air.dialects.air import *
Expand All @@ -24,8 +11,9 @@

range_ = for_


from common import *
IMAGE_WIDTH = 32
IMAGE_HEIGHT = 16
IMAGE_SIZE = [IMAGE_WIDTH, IMAGE_HEIGHT]


@module_builder
Expand Down
87 changes: 71 additions & 16 deletions programming_examples/channel_examples/herd_to_herd/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,77 @@
# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
# SPDX-License-Identifier: MIT
import argparse
import sys
from pathlib import Path # if you haven't already done so

# Python paths are a bit complex. Taking solution from : https://stackoverflow.com/questions/16981921/relative-imports-in-python-3
file = Path(__file__).resolve()
parent, root = file.parent, file.parents[1]
sys.path.append(str(root))

# Additionally remove the current file's directory from sys.path
try:
sys.path.remove(str(parent))
except ValueError: # Already removed
pass

from herd_to_herd.herd_to_herd import build_module
from common import test_main
import numpy as np
import air.backend.xrt as xrt_backend
import filelock

from herd_to_herd import *

INOUT_DATATYPE = np.uint32
INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
INOUT_SIZE = IMAGE_SIZE[0] * IMAGE_SIZE[1]
INOUT_SIZE_BYTES = INOUT_SIZE * INOUT_ELEM_SIZE


def print_matrix(matrix_array):
    """Print ``matrix_array`` as IMAGE_HEIGHT lines of IMAGE_WIDTH hex values.

    Only the low 16 bits of each element are shown, as four hex digits
    followed by a single space.
    """
    for row_index in range(IMAGE_HEIGHT):
        start = row_index * IMAGE_WIDTH
        stop = start + IMAGE_WIDTH
        # Every value keeps its trailing space, including the last one.
        line = "".join(f"{val & 0xFFFF:04x} " for val in matrix_array[start:stop])
        print(line)


def test_main(build_module, verbose=False):
    """Build the module, run it on the XRT backend and check the output.

    Args:
        build_module: Zero-argument callable that returns the module to
            compile and run.
        verbose: Forwarded to the backend. When true, the second input and
            the output are also dumped with ``print_matrix`` and every
            mismatching element is reported.

    The process exits with status 0 after printing ``PASS!`` when every
    output element equals the expected value, and with status -1 after
    printing the error count otherwise.
    """
    # Local import so this function does not depend on `exit`, which is
    # only injected by the `site` module.
    import sys

    mlir_module = build_module()

    # Both inputs are constant-filled arrays of INOUT_SIZE elements.
    input_a = np.full(INOUT_SIZE, 0x2, dtype=INOUT_DATATYPE)
    input_b = np.full(INOUT_SIZE, 0x00C0FFEE, dtype=INOUT_DATATYPE)

    # TODO(hunhoffe): need to figure out why single-core-dma fails with experimental_passes=True
    backend = xrt_backend.XRTBackend(verbose=verbose, omit_while_true_loop=True)

    if verbose:
        print_matrix(input_b)

    # run the module
    # NOTE(review): the file lock presumably serializes access to the device
    # between concurrently running examples - confirm.
    with filelock.FileLock("/tmp/npu.lock"):
        addone = backend.compile_and_load(mlir_module)
        (_, output_b) = addone(input_a, input_b)

    backend.unload()

    if verbose:
        print_matrix(output_b)

    # check output: every element should hold the same updated value
    expected_value = 0x2 * 0x2 + 1
    errors = 0
    for i in range(INOUT_SIZE):
        rb = output_b[i]
        if rb != expected_value:
            if verbose:
                # Report the mismatch with its (column, row) position.
                row, col = divmod(i, IMAGE_WIDTH)
                print(
                    f"IM {i} [{col}, {row}] should be 0x{expected_value:x}, is 0x{int(rb):x}"
                )
            errors += 1

    if errors == 0:
        print("PASS!")
        sys.exit(0)
    else:
        print("failed. errors=", errors)
        sys.exit(-1)


if __name__ == "__main__":
parser = argparse.ArgumentParser(
Expand Down

0 comments on commit 6f6f639

Please sign in to comment.