Channel Examples: Worker-to-self, Worker-to-worker #653

Merged: 20 commits, Jul 10, 2024
22 changes: 22 additions & 0 deletions programming_examples/channel_examples/README.md
@@ -22,4 +22,26 @@ cd channel_size
make clean && make
```

#### WIP: ```worker-to-self```:

This example ([worker_to_self/worker_to_self.py](worker_to_self/worker_to_self.py)) is a work-in-progress data passthrough example. It uses the same tiling structure as the [matrix_scalar_add/multi_core_channel](../matrix_scalar_add/multi_core_channel.py) example, except that the sole worker in the herd does some extra shuffling between input and output: it puts the current data tile into a channel and then gets it back from that same channel.

WARNING: This example currently fails because it is assumed that the puts and gets of a channel do not use the same memory region, and this example breaks that assumption.
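
For orientation, the core of the pattern is sketched below, condensed from [worker_to_self/worker_to_self.py](worker_to_self/worker_to_self.py); the surrounding `launch`/`segment` wrappers, the element-wise copy loops, and the deallocations are omitted, so this fragment is an illustration rather than a standalone module:

```python
# Condensed from worker_to_self.py: the herd's single worker puts a tile
# into the "ToSelf" channel and immediately gets it back into a second
# L1 buffer.
ChannelOp("ToSelf")

@herd(name="copyherd", sizes=[1, 1])
def herd_body(tx, ty, sx, sy):
    tensor_out = AllocOp(image_type, [], [])  # tile previously copied in from ChanIn
    tensor_in2 = AllocOp(image_type, [], [])  # buffer the tile is read back into

    # Both endpoints of "ToSelf" are serviced by the same worker, so the put
    # and the get touch the same L1 memory region (the assumption this
    # example breaks).
    ChannelPut("ToSelf", tensor_out)
    ChannelGet("ToSelf", tensor_in2)
```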

```bash
cd worker_to_self
make clean && make
```

#### WIP: ```worker-to-worker```:

This example ([worker_to_worker/worker_to_worker.py](worker_to_worker/worker_to_worker.py)) is a work-in-progress data passthrough example. It uses the same tiling structure as the [matrix_scalar_add/multi_core_channel](../matrix_scalar_add/multi_core_channel.py) example, except that each worker trades a tile of input data with another worker in the herd by sending it over a channel.

WARNING: This example currently fails for unknown reasons.
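
The AIR code for the tile exchange lives in [worker_to_worker/worker_to_worker.py](worker_to_worker/worker_to_worker.py) and is not repeated here. As a rough host-side mental model (plain Python with queues, not the AIR channel API; all names below are made up for the sketch), each worker pushes its tile to a partner's queue and pops whatever arrived in its own, so the output holds the same tiles in exchanged positions:

```python
# Host-side illustration only: models "trading tiles over a channel" with
# Python queues. This is not the AIR implementation used in
# worker_to_worker.py.
import numpy as np
from queue import Queue

TILE = 4
NUM_WORKERS = 2

# Each worker starts with its own tile of data.
tiles = [np.full((TILE, TILE), k, dtype=np.uint32) for k in range(NUM_WORKERS)]

# One queue per worker stands in for that worker's channel endpoint.
channels = [Queue() for _ in range(NUM_WORKERS)]

# Each worker sends its tile to its partner's channel ...
for worker, tile in enumerate(tiles):
    partner = (worker + 1) % NUM_WORKERS
    channels[partner].put(tile)

# ... and receives whatever landed in its own channel.
received = [channels[worker].get() for worker in range(NUM_WORKERS)]

# Every worker now holds its partner's tile: the output is the same set of
# tiles, just in exchanged positions.
assert all((received[w] == tiles[(w - 1) % NUM_WORKERS]).all() for w in range(NUM_WORKERS))
```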

```bash
cd worker_to_worker
make clean && make
```

#### WIP: more examples!
@@ -21,10 +21,6 @@
assert IMAGE_HEIGHT % TILE_HEIGHT == 0


def format_name(prefix, index_0, index_1):
    return f"{prefix}{index_0:02}{index_1:02}"


@module_builder
def build_module():
    memrefTyInOut = MemRefType.get(IMAGE_SIZE, T.i32())
4 changes: 3 additions & 1 deletion programming_examples/channel_examples/channel_size/run.py
@@ -33,7 +33,9 @@ def test_main(build_module, verbose=False):
        input_a[i] = i + 0x1000
        input_b[i] = 0x00DEFACED

    backend = xrt_backend.XRTBackend(verbose=verbose, omit_while_true_loop=True)
    backend = xrt_backend.XRTBackend(
        verbose=verbose, experimental_passes=True, omit_while_true_loop=True
    )

    if verbose:
        print_matrix(input_b)
12 changes: 12 additions & 0 deletions programming_examples/channel_examples/worker_to_self/Makefile
@@ -0,0 +1,12 @@
# (c) Copyright 2024 Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))

targetname := $(shell basename ${srcdir})

run:
	mkdir -p build
	cd build && ${powershell} python3 ${srcdir}/run.py -v

clean:
	rm -rf build __pycache__
88 changes: 88 additions & 0 deletions programming_examples/channel_examples/worker_to_self/run.py
@@ -0,0 +1,88 @@
# run.py -*- Python -*-
#
# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
# SPDX-License-Identifier: MIT
import argparse
import numpy as np
import air.backend.xrt as xrt_backend
import filelock

from worker_to_self import *

INOUT_DATATYPE = np.uint32
INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
INOUT_SIZE = IMAGE_SIZE[0] * IMAGE_SIZE[1]
INOUT_SIZE_BYTES = INOUT_SIZE * INOUT_ELEM_SIZE


def print_matrix(matrix_array):
    for i in range(IMAGE_HEIGHT):
        row = matrix_array[i * IMAGE_WIDTH : (i + 1) * IMAGE_WIDTH]
        for val in row:
            val = val & 0xFFFF
            print(f"{val:04x}", end=" ")
        print("")


def test_main(build_module, verbose=False):
    mlir_module = build_module()

    input_a = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
    input_b = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
    for i in range(INOUT_SIZE):
        input_a[i] = i + 0x1000
        input_b[i] = 0x00DEFACED

    backend = xrt_backend.XRTBackend(verbose=verbose, omit_while_true_loop=True)

    if verbose:
        print_matrix(input_b)

    # run the module
    with filelock.FileLock("/tmp/npu.lock"):
        addone = backend.compile_and_load(mlir_module)
        (_, output_b) = addone(input_a, input_b)

        backend.unload()

    if verbose:
        print_matrix(output_b)

    # check output, should match the input data (this is a passthrough)
    errors = 0
    for i in range(INOUT_SIZE):
        rb = output_b[i]
        expected_value = input_a[i]

        # value should have been updated
        if not (rb == expected_value):
            """
            row = i // IMAGE_WIDTH
            col = i % IMAGE_WIDTH
            print(
                f"IM {i} [{col}, {row}] should be 0x{expected_value:x}, is 0x{rb:x}\n"
            )
            """
            errors += 1

    if errors == 0:
        print("PASS!")
        exit(0)
    else:
        print("failed. errors=", errors)
        exit(-1)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="run.py",
        description="Builds, runs, and tests the channel_examples/worker_to_self example",
    )

    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
    )
    args = parser.parse_args()
    test_main(build_module, verbose=args.verbose)
@@ -0,0 +1,9 @@
// (c) Copyright 2024 Advanced Micro Devices, Inc.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// REQUIRES: ryzen_ai
//
// RUN: make -f %S/Makefile clean
// RUN: make -f %S/Makefile run | FileCheck %s
// CHECK: PASS!
// XFAIL: *
@@ -0,0 +1,99 @@
# Copyright (C) 2024, Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT

from air.ir import *
from air.dialects.air import *
from air.dialects.memref import AllocOp, DeallocOp, load, store
from air.dialects.func import FuncOp
from air.dialects.scf import for_, yield_

range_ = for_

IMAGE_WIDTH = 32
IMAGE_HEIGHT = 16
IMAGE_SIZE = [IMAGE_WIDTH, IMAGE_HEIGHT]


@module_builder
def build_module():

    # Type and method of input/output
    memrefTyInOut = T.MemRefType.get(IMAGE_SIZE, T.i32())
    ChannelOp("ChanIn")
    ChannelOp("ChanOut")
    ChannelOp("ToSelf")

    # We want to store our data in L1 memory
    mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)

    # This is the type definition of the image
    image_type = MemRefType.get(
        shape=IMAGE_SIZE,
        element_type=T.i32(),
        memory_space=mem_space,
    )

    # We will send an image worth of data in and out
    @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
    def copy(arg0, arg1):

        # The arguments are the input and output
        @launch(operands=[arg0, arg1])
        def launch_body(a, b):
            ChannelPut("ChanIn", a)
            ChannelGet("ChanOut", b)

            # The arguments are still the input and the output
            @segment(name="seg")
            def segment_body():

                # The herd sizes correspond to the dimensions of the contiguous block of cores we are hoping to get.
                # We just need one compute core, so we ask for a 1x1 herd
                @herd(name="copyherd", sizes=[1, 1])
                def herd_body(tx, ty, sx, sy):

                    # We must allocate a buffer of image size for the input/output
                    tensor_in = AllocOp(image_type, [], [])
                    tensor_out = AllocOp(image_type, [], [])
                    tensor_in2 = AllocOp(image_type, [], [])
                    tensor_out2 = AllocOp(image_type, [], [])

                    ChannelGet("ChanIn", tensor_in)

                    # Access every value in the tile
                    for j in range_(IMAGE_HEIGHT):
                        for i in range_(IMAGE_WIDTH):
                            # Load the input value from tile_in
                            val = load(tensor_in, [i, j])

                            # Store the output value in tile_out
                            store(val, tensor_out, [i, j])
                            yield_([])
                        yield_([])

                    ChannelPut("ToSelf", tensor_out)
                    ChannelGet("ToSelf", tensor_in2)

                    # Access every value in the tile
                    for j in range_(IMAGE_HEIGHT):
                        for i in range_(IMAGE_WIDTH):
                            # Load the input value from tile_in
                            val = load(tensor_in2, [i, j])

                            # Store the output value in tile_out
                            store(val, tensor_out2, [i, j])
                            yield_([])
                        yield_([])

                    ChannelPut("ChanOut", tensor_out2)

                    # Deallocate our L1 buffers
                    DeallocOp(tensor_in)
                    DeallocOp(tensor_out)
                    DeallocOp(tensor_in2)
                    DeallocOp(tensor_out2)


if __name__ == "__main__":
    module = build_module()
    print(module)
12 changes: 12 additions & 0 deletions programming_examples/channel_examples/worker_to_worker/Makefile
@@ -0,0 +1,12 @@
# (c) Copyright 2024 Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))

targetname := $(shell basename ${srcdir})

run:
	mkdir -p build
	cd build && ${powershell} python3 ${srcdir}/run.py -v

clean:
	rm -rf build __pycache__
88 changes: 88 additions & 0 deletions programming_examples/channel_examples/worker_to_worker/run.py
@@ -0,0 +1,88 @@
# run.py -*- Python -*-
#
# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
# SPDX-License-Identifier: MIT
import argparse
import numpy as np
import air.backend.xrt as xrt_backend
import filelock

from worker_to_worker import *

INOUT_DATATYPE = np.uint32
INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
INOUT_SIZE = IMAGE_SIZE[0] * IMAGE_SIZE[1]
INOUT_SIZE_BYTES = INOUT_SIZE * INOUT_ELEM_SIZE


def print_matrix(matrix_array):
    for i in range(IMAGE_HEIGHT):
        row = matrix_array[i * IMAGE_WIDTH : (i + 1) * IMAGE_WIDTH]
        for val in row:
            val = val & 0xFFFF
            print(f"{val:04x}", end=" ")
        print("")


def test_main(build_module, verbose=False):
    mlir_module = build_module()

    input_a = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
    input_b = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
    for i in range(INOUT_SIZE):
        input_a[i] = i + 0x1000
        input_b[i] = 0x00DEFACED

    backend = xrt_backend.XRTBackend(verbose=verbose, omit_while_true_loop=True)

    if verbose:
        print_matrix(input_b)

    # run the module
    with filelock.FileLock("/tmp/npu.lock"):
        addone = backend.compile_and_load(mlir_module)
        (_, output_b) = addone(input_a, input_b)

        backend.unload()

    if verbose:
        print_matrix(output_b)

    # check output, should match the input data (this is a passthrough)
    errors = 0
    for i in range(INOUT_SIZE):
        rb = output_b[i]
        expected_value = input_a[i]

        # value should have been updated
        if not (rb == expected_value):
            """
            row = i // IMAGE_WIDTH
            col = i % IMAGE_WIDTH
            print(
                f"IM {i} [{col}, {row}] should be 0x{expected_value:x}, is 0x{rb:x}\n"
            )
            """
            errors += 1

    if errors == 0:
        print("PASS!")
        exit(0)
    else:
        print("failed. errors=", errors)
        exit(-1)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="run.py",
        description="Builds, runs, and tests the channel_examples/worker_to_worker example",
    )

    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
    )
    args = parser.parse_args()
    test_main(build_module, verbose=args.verbose)
@@ -0,0 +1,9 @@
// (c) Copyright 2024 Advanced Micro Devices, Inc.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// REQUIRES: ryzen_ai
//
// RUN: make -f %S/Makefile clean
// RUN: make -f %S/Makefile run | FileCheck %s
// CHECK: PASS!
// XFAIL: *