diff --git a/programming_examples/channel_examples/README.md b/programming_examples/channel_examples/README.md
index e68915329..7ccfc3baa 100644
--- a/programming_examples/channel_examples/README.md
+++ b/programming_examples/channel_examples/README.md
@@ -22,4 +22,26 @@ cd channel_size
 make clean && make
 ```
 
+#### WIP: ```worker-to-self```:
+
+This example ([worker_to_self/worker_to_self.py](worker_to_self/worker_to_self.py)) is a work-in-progress data passthrough example using the same tiling structure as the [matrix_scalar_add/multi_core_channel](../matrix_scalar_add/multi_core_channel.py) examples, except that the sole worker in the herd does some extra shuffling between input and output: it puts the current data tile into a channel and then gets it back from the same channel.
+
+WARNING: This example currently fails because it is assumed that channel puts/gets do not use the same memory region, and this example breaks that assumption.
+
+```bash
+cd worker_to_self
+make clean && make
+```
+
+#### WIP: ```worker-to-worker```:
+
+This example ([worker_to_worker/worker_to_worker.py](worker_to_worker/worker_to_worker.py)) is a work-in-progress data passthrough example using the same tiling structure as the [matrix_scalar_add/multi_core_channel](../matrix_scalar_add/multi_core_channel.py) examples, except that each worker trades a tile of input data with another worker in the herd by sending it through a channel.
+
+WARNING: This example currently fails for unknown reasons.
+
+```bash
+cd worker_to_worker
+make clean && make
+```
+
 #### WIP: more examples!
\ No newline at end of file
diff --git a/programming_examples/channel_examples/channel_size/channel_size.py b/programming_examples/channel_examples/channel_size/channel_size.py
index ef3d5a4b8..3f64db5e1 100644
--- a/programming_examples/channel_examples/channel_size/channel_size.py
+++ b/programming_examples/channel_examples/channel_size/channel_size.py
@@ -21,10 +21,6 @@
 assert IMAGE_HEIGHT % TILE_HEIGHT == 0
 
 
-def format_name(prefix, index_0, index_1):
-    return f"{prefix}{index_0:02}{index_1:02}"
-
-
 @module_builder
 def build_module():
     memrefTyInOut = MemRefType.get(IMAGE_SIZE, T.i32())
diff --git a/programming_examples/channel_examples/channel_size/run.py b/programming_examples/channel_examples/channel_size/run.py
index 39d8d40c0..0a3bb2dd3 100644
--- a/programming_examples/channel_examples/channel_size/run.py
+++ b/programming_examples/channel_examples/channel_size/run.py
@@ -33,7 +33,9 @@ def test_main(build_module, verbose=False):
         input_a[i] = i + 0x1000
         input_b[i] = 0x00DEFACED
 
-    backend = xrt_backend.XRTBackend(verbose=verbose, omit_while_true_loop=True)
+    backend = xrt_backend.XRTBackend(
+        verbose=verbose, experimental_passes=True, omit_while_true_loop=True
+    )
 
     if verbose:
         print_matrix(input_b)
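Note on the change above: every `run.py` in this patch drives the NPU through the same few XRT backend calls. The sketch below strings those calls together for the `channel_size` example, using only the constructor flags and methods that appear in this diff (`XRTBackend`, `compile_and_load`, `unload`); the input values are illustrative, not required.

```python
import numpy as np
import filelock
import air.backend.xrt as xrt_backend

from channel_size import build_module, IMAGE_SIZE  # built by channel_size.py above

mlir_module = build_module()
input_a = np.arange(IMAGE_SIZE[0] * IMAGE_SIZE[1], dtype=np.uint32)
input_b = np.zeros_like(input_a)

backend = xrt_backend.XRTBackend(
    verbose=False, experimental_passes=True, omit_while_true_loop=True
)

# /tmp/npu.lock serializes access to the NPU, as in the run.py scripts.
with filelock.FileLock("/tmp/npu.lock"):
    compiled = backend.compile_and_load(mlir_module)
    (_, output_b) = compiled(input_a, input_b)
backend.unload()
```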
diff --git a/programming_examples/channel_examples/worker_to_self/Makefile b/programming_examples/channel_examples/worker_to_self/Makefile
new file mode 100644
index 000000000..79be368b8
--- /dev/null
+++ b/programming_examples/channel_examples/worker_to_self/Makefile
@@ -0,0 +1,12 @@
+# (c) Copyright 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
+
+targetname := $(shell basename ${srcdir})
+
+run:
+	mkdir -p build
+	cd build && ${powershell} python3 ${srcdir}/run.py -v
+
+clean:
+	rm -rf build __pycache__
diff --git a/programming_examples/channel_examples/worker_to_self/run.py b/programming_examples/channel_examples/worker_to_self/run.py
new file mode 100644
index 000000000..c5b7825d5
--- /dev/null
+++ b/programming_examples/channel_examples/worker_to_self/run.py
@@ -0,0 +1,88 @@
+# run.py -*- Python -*-
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: MIT
+import argparse
+import numpy as np
+import air.backend.xrt as xrt_backend
+import filelock
+
+from worker_to_self import *
+
+INOUT_DATATYPE = np.uint32
+INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
+INOUT_SIZE = IMAGE_SIZE[0] * IMAGE_SIZE[1]
+INOUT_SIZE_BYTES = INOUT_SIZE * INOUT_ELEM_SIZE
+
+
+def print_matrix(matrix_array):
+    for i in range(IMAGE_HEIGHT):
+        row = matrix_array[i * IMAGE_WIDTH : (i + 1) * IMAGE_WIDTH]
+        for val in row:
+            val = val & 0xFFFF
+            print(f"{val:04x}", end=" ")
+        print("")
+
+
+def test_main(build_module, verbose=False):
+    mlir_module = build_module()
+
+    input_a = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
+    input_b = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
+    for i in range(INOUT_SIZE):
+        input_a[i] = i + 0x1000
+        input_b[i] = 0x00DEFACED
+
+    backend = xrt_backend.XRTBackend(verbose=verbose, omit_while_true_loop=True)
+
+    if verbose:
+        print_matrix(input_b)
+
+    # run the module
+    with filelock.FileLock("/tmp/npu.lock"):
+        addone = backend.compile_and_load(mlir_module)
+        (_, output_b) = addone(input_a, input_b)
+
+    backend.unload()
+
+    if verbose:
+        print_matrix(output_b)
+
+    # check output: this is a passthrough, so output should match input_a
+    errors = 0
+    for i in range(INOUT_SIZE):
+        rb = output_b[i]
+        expected_value = input_a[i]
+
+        # value should have passed through unchanged
+        if not (rb == expected_value):
+            """
+            row = i // IMAGE_WIDTH
+            col = i % IMAGE_WIDTH
+            print(
+                f"IM {i} [{col}, {row}] should be 0x{expected_value:x}, is 0x{rb:x}\n"
+            )
+            """
+            errors += 1
+
+    if errors == 0:
+        print("PASS!")
+        exit(0)
+    else:
+        print("failed. errors=", errors)
+        exit(-1)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        prog="run.py",
+        description="Builds, runs, and tests the channel_examples/worker_to_self example",
+    )
+
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+    )
+    args = parser.parse_args()
+    test_main(build_module, verbose=args.verbose)
diff --git a/programming_examples/channel_examples/worker_to_self/run_makefile.lit b/programming_examples/channel_examples/worker_to_self/run_makefile.lit
new file mode 100644
index 000000000..07d105c0c
--- /dev/null
+++ b/programming_examples/channel_examples/worker_to_self/run_makefile.lit
@@ -0,0 +1,9 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai
+//
+// RUN: make -f %S/Makefile clean
+// RUN: make -f %S/Makefile run | FileCheck %s
+// CHECK: PASS!
+// XFAIL: *
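The element-by-element check in `run.py` above can be collapsed into a vectorized NumPy comparison. A minimal sketch of that equivalent check (the helper name is made up for illustration; it is not part of this patch):

```python
import numpy as np

def count_errors(output_b: np.ndarray, input_a: np.ndarray) -> int:
    # Both WIP examples are pure passthroughs, so every element of the output
    # buffer is expected to equal the corresponding element of input_a.
    return int(np.count_nonzero(output_b != input_a))

# Usage, mirroring run.py's pass/fail decision:
# errors = count_errors(output_b, input_a)
# print("PASS!" if errors == 0 else f"failed. errors={errors}")
```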
diff --git a/programming_examples/channel_examples/worker_to_self/worker_to_self.py b/programming_examples/channel_examples/worker_to_self/worker_to_self.py
new file mode 100644
index 000000000..8060fe363
--- /dev/null
+++ b/programming_examples/channel_examples/worker_to_self/worker_to_self.py
@@ -0,0 +1,99 @@
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+
+from air.ir import *
+from air.dialects.air import *
+from air.dialects.memref import AllocOp, DeallocOp, load, store
+from air.dialects.func import FuncOp
+from air.dialects.scf import for_, yield_
+
+range_ = for_
+
+IMAGE_WIDTH = 32
+IMAGE_HEIGHT = 16
+IMAGE_SIZE = [IMAGE_WIDTH, IMAGE_HEIGHT]
+
+
+@module_builder
+def build_module():
+
+    # Type and method of input/output
+    memrefTyInOut = MemRefType.get(IMAGE_SIZE, T.i32())
+    ChannelOp("ChanIn")
+    ChannelOp("ChanOut")
+    ChannelOp("ToSelf")
+
+    # We want to store our data in L1 memory
+    mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
+
+    # This is the type definition of the image
+    image_type = MemRefType.get(
+        shape=IMAGE_SIZE,
+        element_type=T.i32(),
+        memory_space=mem_space,
+    )
+
+    # We will send an image worth of data in and out
+    @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
+    def copy(arg0, arg1):
+
+        # The arguments are the input and output
+        @launch(operands=[arg0, arg1])
+        def launch_body(a, b):
+            ChannelPut("ChanIn", a)
+            ChannelGet("ChanOut", b)
+
+            # The arguments are still the input and the output
+            @segment(name="seg")
+            def segment_body():
+
+                # The herd sizes correspond to the dimensions of the contiguous block of cores we are hoping to get.
+                # We just need one compute core, so we ask for a 1x1 herd
+                @herd(name="copyherd", sizes=[1, 1])
+                def herd_body(tx, ty, sx, sy):
+
+                    # We must allocate a buffer of image size for the input/output
+                    tensor_in = AllocOp(image_type, [], [])
+                    tensor_out = AllocOp(image_type, [], [])
+                    tensor_in2 = AllocOp(image_type, [], [])
+                    tensor_out2 = AllocOp(image_type, [], [])
+
+                    ChannelGet("ChanIn", tensor_in)
+
+                    # Access every value in the image
+                    for j in range_(IMAGE_HEIGHT):
+                        for i in range_(IMAGE_WIDTH):
+                            # Load the input value from tensor_in
+                            val = load(tensor_in, [i, j])
+
+                            # Store the output value in tensor_out
+                            store(val, tensor_out, [i, j])
+                            yield_([])
+                        yield_([])
+
+                    ChannelPut("ToSelf", tensor_out)
+                    ChannelGet("ToSelf", tensor_in2)
+
+                    # Access every value in the image
+                    for j in range_(IMAGE_HEIGHT):
+                        for i in range_(IMAGE_WIDTH):
+                            # Load the input value from tensor_in2
+                            val = load(tensor_in2, [i, j])
+
+                            # Store the output value in tensor_out2
+                            store(val, tensor_out2, [i, j])
+                            yield_([])
+                        yield_([])
+
+                    ChannelPut("ChanOut", tensor_out2)
+
+                    # Deallocate our L1 buffers
+                    DeallocOp(tensor_in)
+                    DeallocOp(tensor_out)
+                    DeallocOp(tensor_in2)
+                    DeallocOp(tensor_out2)
+
+
+if __name__ == "__main__":
+    module = build_module()
+    print(module)
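As a host-side picture of the dataflow `worker_to_self.py` sets up, the sketch below models the put/get hops with `queue.SimpleQueue` objects standing in for the AIR channels (an illustration of intent only; it says nothing about how channels are lowered to the device, which is where this example currently breaks):

```python
import numpy as np
from queue import SimpleQueue

IMAGE_WIDTH, IMAGE_HEIGHT = 32, 16

# Stand-ins for the channels declared in build_module().
chan_in, to_self, chan_out = SimpleQueue(), SimpleQueue(), SimpleQueue()

image = np.arange(IMAGE_WIDTH * IMAGE_HEIGHT, dtype=np.uint32).reshape(
    IMAGE_WIDTH, IMAGE_HEIGHT
)
chan_in.put(image)                    # launch: ChannelPut("ChanIn", a)

tensor_in = chan_in.get()             # herd: ChannelGet("ChanIn", tensor_in)
tensor_out = tensor_in.copy()         # first element-by-element copy loop
to_self.put(tensor_out)               # ChannelPut("ToSelf", tensor_out)
tensor_in2 = to_self.get()            # ChannelGet("ToSelf", tensor_in2)
tensor_out2 = tensor_in2.copy()       # second copy loop
chan_out.put(tensor_out2)             # ChannelPut("ChanOut", tensor_out2)

result = chan_out.get()               # launch: ChannelGet("ChanOut", b)
assert np.array_equal(result, image)  # the example is a pure passthrough
```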
diff --git a/programming_examples/channel_examples/worker_to_worker/Makefile b/programming_examples/channel_examples/worker_to_worker/Makefile
new file mode 100644
index 000000000..79be368b8
--- /dev/null
+++ b/programming_examples/channel_examples/worker_to_worker/Makefile
@@ -0,0 +1,12 @@
+# (c) Copyright 2024 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
+
+targetname := $(shell basename ${srcdir})
+
+run:
+	mkdir -p build
+	cd build && ${powershell} python3 ${srcdir}/run.py -v
+
+clean:
+	rm -rf build __pycache__
diff --git a/programming_examples/channel_examples/worker_to_worker/run.py b/programming_examples/channel_examples/worker_to_worker/run.py
new file mode 100644
index 000000000..b62fc4746
--- /dev/null
+++ b/programming_examples/channel_examples/worker_to_worker/run.py
@@ -0,0 +1,88 @@
+# run.py -*- Python -*-
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: MIT
+import argparse
+import numpy as np
+import air.backend.xrt as xrt_backend
+import filelock
+
+from worker_to_worker import *
+
+INOUT_DATATYPE = np.uint32
+INOUT_ELEM_SIZE = np.dtype(INOUT_DATATYPE).itemsize
+INOUT_SIZE = IMAGE_SIZE[0] * IMAGE_SIZE[1]
+INOUT_SIZE_BYTES = INOUT_SIZE * INOUT_ELEM_SIZE
+
+
+def print_matrix(matrix_array):
+    for i in range(IMAGE_HEIGHT):
+        row = matrix_array[i * IMAGE_WIDTH : (i + 1) * IMAGE_WIDTH]
+        for val in row:
+            val = val & 0xFFFF
+            print(f"{val:04x}", end=" ")
+        print("")
+
+
+def test_main(build_module, verbose=False):
+    mlir_module = build_module()
+
+    input_a = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
+    input_b = np.arange(1, INOUT_SIZE + 1, dtype=INOUT_DATATYPE)
+    for i in range(INOUT_SIZE):
+        input_a[i] = i + 0x1000
+        input_b[i] = 0x00DEFACED
+
+    backend = xrt_backend.XRTBackend(verbose=verbose, omit_while_true_loop=True)
+
+    if verbose:
+        print_matrix(input_b)
+
+    # run the module
+    with filelock.FileLock("/tmp/npu.lock"):
+        addone = backend.compile_and_load(mlir_module)
+        (_, output_b) = addone(input_a, input_b)
+
+    backend.unload()
+
+    if verbose:
+        print_matrix(output_b)
+
+    # check output: this is a passthrough, so output should match input_a
+    errors = 0
+    for i in range(INOUT_SIZE):
+        rb = output_b[i]
+        expected_value = input_a[i]
+
+        # value should have passed through unchanged
+        if not (rb == expected_value):
+            """
+            row = i // IMAGE_WIDTH
+            col = i % IMAGE_WIDTH
+            print(
+                f"IM {i} [{col}, {row}] should be 0x{expected_value:x}, is 0x{rb:x}\n"
+            )
+            """
+            errors += 1
+
+    if errors == 0:
+        print("PASS!")
+        exit(0)
+    else:
+        print("failed. errors=", errors)
+        exit(-1)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        prog="run.py",
+        description="Builds, runs, and tests the channel_examples/worker_to_worker example",
+    )
+
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+    )
+    args = parser.parse_args()
+    test_main(build_module, verbose=args.verbose)
diff --git a/programming_examples/channel_examples/worker_to_worker/run_makefile.lit b/programming_examples/channel_examples/worker_to_worker/run_makefile.lit
new file mode 100644
index 000000000..07d105c0c
--- /dev/null
+++ b/programming_examples/channel_examples/worker_to_worker/run_makefile.lit
@@ -0,0 +1,9 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai
+//
+// RUN: make -f %S/Makefile clean
+// RUN: make -f %S/Makefile run | FileCheck %s
+// CHECK: PASS!
+// XFAIL: *
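In `worker_to_worker.py` below, each worker uses affine maps to compute the indices of the worker it trades with: `(index + 1) mod grid_size` in each dimension. A plain-Python sketch of just that index arithmetic for the 2x2 herd used here (names are illustrative):

```python
# Grid dimensions from worker_to_worker.py:
# IMAGE_WIDTH // TILE_WIDTH == 2 and IMAGE_HEIGHT // TILE_HEIGHT == 2.
W_TILES, H_TILES = 2, 2

def next_worker(tw: int, th: int) -> tuple[int, int]:
    # Mirrors the width_next / height_next affine maps: (s0 + 1) mod N.
    return (tw + 1) % W_TILES, (th + 1) % H_TILES

for th in range(H_TILES):
    for tw in range(W_TILES):
        print(f"worker ({tw}, {th}) gets from SwitchTiles{next_worker(tw, th)}")
```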
diff --git a/programming_examples/channel_examples/worker_to_worker/worker_to_worker.py b/programming_examples/channel_examples/worker_to_worker/worker_to_worker.py
new file mode 100644
index 000000000..e1b8fa256
--- /dev/null
+++ b/programming_examples/channel_examples/worker_to_worker/worker_to_worker.py
@@ -0,0 +1,171 @@
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+
+from air.ir import *
+from air.dialects.air import *
+from air.dialects.memref import AllocOp, DeallocOp, load, store
+from air.dialects.func import FuncOp
+from air.dialects.scf import for_, yield_
+from air.dialects.affine import apply as affine_apply
+
+range_ = for_
+
+IMAGE_WIDTH = 32
+IMAGE_HEIGHT = 16
+IMAGE_SIZE = [IMAGE_WIDTH, IMAGE_HEIGHT]
+
+TILE_WIDTH = 16
+TILE_HEIGHT = 8
+TILE_SIZE = [TILE_WIDTH, TILE_HEIGHT]
+
+assert IMAGE_WIDTH % TILE_WIDTH == 0
+assert IMAGE_HEIGHT % TILE_HEIGHT == 0
+
+
+@module_builder
+def build_module():
+    memrefTyInOut = MemRefType.get(IMAGE_SIZE, T.i32())
+
+    # Create an input/output channel pair per worker
+    ChannelOp("ChanIn", size=[IMAGE_WIDTH // TILE_WIDTH, IMAGE_HEIGHT // TILE_HEIGHT])
+    ChannelOp("ChanOut", size=[IMAGE_WIDTH // TILE_WIDTH, IMAGE_HEIGHT // TILE_HEIGHT])
+    ChannelOp(
+        "SwitchTiles", size=[IMAGE_WIDTH // TILE_WIDTH, IMAGE_HEIGHT // TILE_HEIGHT]
+    )
+
+    # We will send an image worth of data in and out
+    @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
+    def copy(arg0, arg1):
+
+        # The arguments are the input and output
+        @launch(operands=[arg0, arg1])
+        def launch_body(a, b):
+
+            # Transfer one tile of data per worker
+            for h in range(IMAGE_HEIGHT // TILE_HEIGHT):
+                for w in range(IMAGE_WIDTH // TILE_WIDTH):
+                    offset0 = IMAGE_HEIGHT * h
+                    offset1 = IMAGE_HEIGHT * w
+
+                    # Put data into the channel tile by tile
+                    ChannelPut(
+                        "ChanIn",
+                        a,
+                        indices=[w, h],
+                        offsets=[offset0, offset1],
+                        sizes=[TILE_HEIGHT, TILE_WIDTH],
+                        strides=[IMAGE_WIDTH, 1],
+                    )
+
+            # Transfer one tile of data per worker
+            for h in range(IMAGE_HEIGHT // TILE_HEIGHT):
+                for w in range(IMAGE_WIDTH // TILE_WIDTH):
+                    offset0 = IMAGE_HEIGHT * h
+                    offset1 = IMAGE_HEIGHT * w
+
+                    # Write data back out to the channel tile by tile
+                    ChannelGet(
+                        "ChanOut",
+                        b,
+                        indices=[w, h],
+                        offsets=[offset0, offset1],
+                        sizes=[TILE_HEIGHT, TILE_WIDTH],
+                        strides=[IMAGE_WIDTH, 1],
+                    )
+
+            # The arguments are still the input and the output
+            @segment(name="seg")
+            def segment_body():
+
+                @herd(
+                    name="xaddherd",
+                    sizes=[IMAGE_WIDTH // TILE_WIDTH, IMAGE_HEIGHT // TILE_HEIGHT],
+                )
+                def herd_body(th, tw, _sx, _sy):
+                    height_next = AffineMap.get(
+                        0,
+                        1,
+                        [
+                            AffineExpr.get_mod(
+                                AffineExpr.get_add(
+                                    AffineSymbolExpr.get(0),
+                                    AffineConstantExpr.get(1),
+                                ),
+                                IMAGE_HEIGHT // TILE_HEIGHT,
+                            )
+                        ],
+                    )
+                    width_next = AffineMap.get(
+                        0,
+                        1,
+                        [
+                            AffineExpr.get_mod(
+                                AffineExpr.get_add(
+                                    AffineSymbolExpr.get(0),
+                                    AffineConstantExpr.get(1),
+                                ),
+                                IMAGE_WIDTH // TILE_WIDTH,
+                            )
+                        ],
+                    )
+                    tw_next = affine_apply(width_next, [tw])
+                    th_next = affine_apply(height_next, [th])
+
+                    # We want to store our data in L1 memory
+                    mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
+                    tile_type = MemRefType.get(
+                        shape=TILE_SIZE,
+                        element_type=T.i32(),
+                        memory_space=mem_space,
+                    )
+
+                    # We must allocate a buffer of tile size for the input/output
+                    tile_in = AllocOp(tile_type, [], [])
+                    tile_in2 = AllocOp(tile_type, [], [])
+                    tile_out = AllocOp(tile_type, [], [])
+                    tile_out2 = AllocOp(tile_type, [], [])
+
+                    # Copy a tile from the input image
ChannelGet("ChanIn", tile_in, indices=[tw, th]) + + # Access every value in the tile + for j in range_(TILE_HEIGHT): + for i in range_(TILE_WIDTH): + # Load the input value from tile_in + val = load(tile_in, [i, j]) + + # Store the output value in tile_out + store(val, tile_out, [i, j]) + yield_([]) + yield_([]) + + # Copy the output tile into a channel for another worker to get + ChannelPut("SwitchTiles", tile_out, indices=[tw, th]) + + # Get an output tile from another worker + ChannelGet("SwitchTiles", tile_in2, indices=[tw_next, th_next]) + + # Access every value in the tile + for j in range_(TILE_HEIGHT): + for i in range_(TILE_WIDTH): + # Load the input value from tile_in + val = load(tile_in2, [i, j]) + + # Store the output value in tile_out + store(val, tile_out2, [i, j]) + yield_([]) + yield_([]) + + # Send the output tile to the output + ChannelPut("ChanOut", tile_out, indices=[tw, th]) + + # Deallocate our L1 buffers + DeallocOp(tile_in) + DeallocOp(tile_out) + DeallocOp(tile_in2) + DeallocOp(tile_out2) + + +if __name__ == "__main__": + module = build_module() + print(module)